In [1]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

# Start a Spark session with Spark NLP.
# start() accepts two parameters, gpu and spark23:
#   sparknlp.start(gpu=True) starts the session with GPU support
#   sparknlp.start(spark23=True) is for when Apache Spark 2.3.x is installed
spark = sparknlp.start()

In [2]:
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

Spark NLP version 2.7.3
Apache Spark version: 2.4.4


In [3]:
# text files with the input

import csv
import pandas as pd
# Load the tab-separated, header-less token file into the columns Word/POS/DEREP/TYPE/SENT_NO.
# keep_default_na=False: by default pandas converts cells such as "None", "NA" or "null"
# to NaN, which would silently replace real word tokens with missing values.
df = pd.read_csv("/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner.txt",delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8',header=None,names=["Word","POS","DEREP","TYPE","SENT_NO"],keep_default_na=False)

In [37]:
df.head()

Unnamed: 0,Word,POS,DEREP,TYPE,SENT_NO
0,But,CC,(TOP(S*,O,1
1,%um,UH,(INTJ*),O,1
2,",",",",*,O,1
3,guessed,VBD,(VP*,O,1
4,what,WP,(SBAR(WHNP*),O,1


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10976 entries, 0 to 10975
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Word     10976 non-null  object
 1   POS      10976 non-null  object
 2   DEREP    10976 non-null  object
 3   TYPE     10976 non-null  object
 4   SENT_NO  10976 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 428.9+ KB


In [5]:
# Perturbation: every token tagged B-PERSON or I-PERSON is replaced by the
# placeholder "Dodo"; all other tokens are kept unchanged.
# Walk the two columns once instead of building a row Series per token with df.iloc.
words = [
    "Dodo" if tag == 'B-PERSON' or tag == 'I-PERSON' else word
    for word, tag in zip(df.Word, df.TYPE)
]

In [6]:
len(words)

32488

In [7]:
df.shape

(32488, 5)

In [8]:
words.count("Dodo")

644

In [9]:
df.iloc[1,:].Word

'basically'

In [10]:
df.Word = words

In [15]:
# Write the perturbed corpus sentence by sentence, with a blank line after each sentence.
# The file is opened once, in write mode: the original re-opened it in append mode for
# every sentence, so re-running the cell appended a second copy of the corpus.
# newline="" lets pandas control the line endings it writes to the handle.
with open("/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner_perturb2.txt","w",encoding="utf-8",newline="") as out_file:
    for _, v in df.groupby("SENT_NO").groups.items():
        temp = df.iloc[v,:]
        temp.to_csv(out_file,sep="\t",index=False,header=False)
        out_file.write("\n")

In [11]:
df.to_csv("/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner_perturb1.txt",sep="\t",index=False,header=False)

In [22]:
import time
import pathlib
# Recursively collect, as strings, the .txt files under the test directory
# whose path ends in "_ner.txt".
path = pathlib.Path("/Users/ramybal/Downloads/bio/spark/test")
flist = [
    name
    for name in map(str, path.rglob("*.txt"))
    if name.endswith("_ner.txt")
]

In [23]:
flist

['/Users/ramybal/Downloads/bio/spark/test/onto_nw_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_wb_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_mz_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_pt_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_bn_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_tc_ner.txt']

In [24]:
flist.remove('/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner.txt')

In [25]:
# Apply the PERSON -> "Dodo" perturbation to every file in flist and write each result,
# one blank-line-terminated sentence at a time, next to its input as "<name>_perturb2.txt".
for f in flist:
    print(f)
    # keep_default_na=False: word tokens such as "None"/"NA"/"null" must stay strings, not NaN.
    df = pd.read_csv(f,delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8',header=None,names=["Word","POS","DEREP","TYPE","SENT_NO"],keep_default_na=False)
    # Single pass over the columns instead of a df.iloc row lookup per token.
    words = [
        "Dodo" if tag == 'B-PERSON' or tag == 'I-PERSON' else word
        for word, tag in zip(df.Word, df.TYPE)
    ]
    print(words.count("Dodo"))
    print(len(words))
    print(df.shape[0])
    df.Word = words
    new_filename = f.replace(".txt","_perturb2.txt")
    # Open the output once in write mode so a re-run replaces the file rather than
    # appending to it. The handle has its own name: the original bound it to `f`,
    # overwriting the loop variable that holds the input path.
    with open(new_filename,"w",encoding="utf-8",newline="") as out_file:
        for _, v in df.groupby("SENT_NO").groups.items():
            temp = df.iloc[v,:]
            temp.to_csv(out_file,sep="\t",index=False,header=False)
            out_file.write("\n")

/Users/ramybal/Downloads/bio/spark/test/onto_nw_ner.txt
792
49235
49235
/Users/ramybal/Downloads/bio/spark/test/onto_wb_ner.txt
679
18945
18945
/Users/ramybal/Downloads/bio/spark/test/onto_mz_ner.txt
449
17875
17875
/Users/ramybal/Downloads/bio/spark/test/onto_pt_ner.txt
0
16851
16851
/Users/ramybal/Downloads/bio/spark/test/onto_bn_ner.txt
726
23209
23209
/Users/ramybal/Downloads/bio/spark/test/onto_tc_ner.txt
110
10976
10976


In [26]:
df.TYPE.unique()

array(['O', 'B-DATE', 'I-DATE', 'B-PERSON', 'B-GPE', 'I-GPE', 'B-TIME',
       'B-CARDINAL', 'B-NORP', 'I-CARDINAL', 'I-TIME', 'B-ORDINAL',
       'B-PRODUCT', 'B-ORG', 'B-PERCENT', 'I-PERCENT', 'B-LANGUAGE',
       'I-PERSON', 'I-ORG', 'B-FAC', 'I-FAC', 'B-MONEY', 'I-MONEY',
       'I-PRODUCT'], dtype=object)

In [27]:
# Embedding stage: the pretrained "bert_base_cased" English model, reading the
# "sentence" and "token" columns and writing "embeddings".
embeddings = (
    BertEmbeddings.pretrained("bert_base_cased", "en")
    .setInputCols("sentence","token")
    .setOutputCol("embeddings")
)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [28]:
# NER stage: the pretrained "onto_bert_base_cased" English model, reading
# "document", "token" and "embeddings" and writing its tags to "ner".
ner_onto = (
    NerDLModel.pretrained("onto_bert_base_cased", "en")
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
)

onto_bert_base_cased download started this may take some time.
Approximate size to download 15.5 MB
[OK!]


In [29]:
# Document assembler: reads the raw "text" column and writes "document",
# with cleanup mode "shrink".
# NOTE(review): the pipeline cell builds its own `document_assembler`; this
# camelCase variable does not appear in the pipeline's stage list.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

In [30]:
# Tokenizer stage: reads "sentence" and writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

In [31]:
# Build the stages in the order they run: text -> document -> sentence -> token,
# then the embedding and NER models defined in earlier cells.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

nlp_pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        embeddings,
        ner_onto,
    ]
)

In [32]:
# Fit the pipeline on a one-row DataFrame whose "text" value is an empty string,
# producing the model object that later cells apply with transform().
# NOTE(review): presumably sufficient because every stage is pretrained or rule-based — confirm.
pipeline_model = nlp_pipeline.fit(spark.createDataFrame([['']]).toDF('text'))


In [35]:
import time
# Recursively collect, as strings, every file under the test directory whose
# path contains "perturb1" (only names containing a dot are matched by "*.*").
path = pathlib.Path("/Users/ramybal/Downloads/bio/spark/test")
flist = [
    name
    for name in map(str, path.rglob("*.*"))
    if "perturb1" in name
]

In [36]:
flist

['/Users/ramybal/Downloads/bio/spark/test/onto_bn_ner_perturb1.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_nw_ner_perturb1.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_wb_ner_perturb1.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_tc_ner_perturb1.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_mz_ner_perturb1.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_pt_ner_perturb1.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner_perturb1.txt']

In [27]:
# For every perturbed corpus file: rebuild its sentences from the token rows, run each
# sentence through the fitted Spark NLP pipeline, and save actual vs. detected tokens
# and tags to a "detect_*.csv" file in the working directory.
for f in flist:
    print("doing for: " + f)
    # keep_default_na=False: word tokens such as "None"/"NA"/"null" must stay strings, not NaN.
    df = pd.read_csv(f,delimiter="\t", quoting=csv.QUOTE_NONE,header=None,encoding='utf-8',names=["Word","POS","DEREP","TYPE","SENT_NO"],keep_default_na=False)
    sentences = []
    entities = []
    entities_type = []
    sentence_groups = df.groupby("SENT_NO").groups
    print("there are {} sentences".format(len(sentence_groups)))
    count = 1
    start = time.time()
    for _,v in sentence_groups.items():
        if count%1000 == 0:
            print("Done for {} in {} seconds".format(count,time.time()-start))
        # Slice the sentence once; the original re-sliced the frame and rebuilt the
        # word list several times for every token.
        sentence_rows = df.iloc[v,:]
        sent_words = sentence_rows.Word.tolist()
        sent_types = sentence_rows.TYPE.tolist()
        last = len(sent_words) - 1
        kept_types = []
        kept_words = []
        for i,t in enumerate(sent_words):
            # A token that does not start with an alphanumeric character is only kept
            # when it directly follows a token that does (handled below); on its own
            # turn it is skipped.
            # NOTE(review): this drops such tokens at the start of a sentence and all
            # but the first of a run of them — confirm that is intended.
            if not t[0].isalnum():
                continue
            kept_types.append(sent_types[i])
            kept_words.append(t)
            if i < last and not sent_words[i+1][0].isalnum():
                kept_types.append(sent_types[i+1])
                kept_words.append(sent_words[i+1])
        sentences.append(" ".join(kept_words))
        entities_type.append(kept_types)
        entities.append(kept_words)
        count += 1
    # Drop sentences for which no token was kept (the three lists stay aligned because
    # an empty sentence string corresponds to empty token and tag lists).
    sentences =  [s for s in sentences if s != ""]
    entities_type =  [s for s in entities_type if s != []]
    entities =  [s for s in entities if s != []]
    detected_sent = []
    detected_ner = []
    detected_ner_type = []
    start = time.time()
    for sent in sentences:
        # Own name for the Spark DataFrame: the original rebound `df`, replacing the
        # pandas corpus frame with a Spark object.
        spark_df = spark.createDataFrame(pd.DataFrame({'text':[sent]}))
        result = pipeline_model.transform(spark_df)
        df_result = result.select("sentence","ner").collect()

        # Only the first detected sentence annotation of the row is recorded.
        detected_sent.append([df_result[0][0][0].result])
        ner_annotations = df_result[0][1]
        detected_ner.append([d.metadata["word"] for d in ner_annotations])
        detected_ner_type.append([d.result for d in ner_annotations])
    print("Done processing in {} seconds".format(time.time()-start))
    # NOTE(review): the input names already end in "_perturb1", so the output name ends
    # in "_perturb1_perturb1.csv"; kept as is because later cells load files by this pattern.
    pd.DataFrame({"Detected_sentence":detected_sent,"Actual_ners":entities,"Detected_ners":detected_ner,"Actual_ner_types":entities_type,"Detected_ner_types":detected_ner_type}).to_csv("detect_" + pathlib.Path(f).name.split(".")[0] + "_perturb1.csv",index = False)

doing for: /Users/ramybal/Downloads/bio/spark/test/onto_bn_ner_perturb1.txt
there are 1252 sentences
Done for 1000 in 28.919176816940308 seconds
Done processing in 220.06137013435364 seconds
doing for: /Users/ramybal/Downloads/bio/spark/test/onto_nw_ner_perturb1.txt
there are 1898 sentences
Done for 1000 in 35.327131032943726 seconds
Done processing in 331.36916995048523 seconds
doing for: /Users/ramybal/Downloads/bio/spark/test/onto_wb_ner_perturb1.txt
there are 929 sentences
Done processing in 166.31479001045227 seconds
doing for: /Users/ramybal/Downloads/bio/spark/test/onto_tc_ner_perturb1.txt
there are 1366 sentences
Done for 1000 in 12.347544193267822 seconds
Done processing in 200.78535294532776 seconds
doing for: /Users/ramybal/Downloads/bio/spark/test/onto_mz_ner_perturb1.txt
there are 780 sentences
Done processing in 152.22356867790222 seconds
doing for: /Users/ramybal/Downloads/bio/spark/test/onto_pt_ner_perturb1.txt
there are 1217 sentences
Done for 1000 in 20.62511992454528

In [28]:
flist

['/Users/ramybal/Downloads/bio/spark/test/onto_bn_ner_perturb1.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_nw_ner_perturb1.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_wb_ner_perturb1.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_tc_ner_perturb1.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_mz_ner_perturb1.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_pt_ner_perturb1.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner_perturb1.txt']

In [32]:
# Find the detection result CSVs under the neuroner folder and pick the first one.
# NOTE(review): the order of the matches is whatever rglob yields, so which file
# is flist[0] may differ between machines — confirm before comparing runs.
path = pathlib.Path("/Users/ramybal/Desktop/untitled folder/neuroner/")
flist = list(map(str, path.rglob("detect*_perturb1.csv")))
f = flist[0]

In [33]:
f

'/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_mz_ner_perturb1_perturb1.csv'

In [34]:
df = pd.read_csv(f)

In [35]:
df.head()

Unnamed: 0,Detected_sentence,Actual_ners,Detected_ners,Actual_ner_types,Detected_ner_types
0,['Genteel Journalis --'],"['Genteel', 'Journalis', '--']","['Genteel', 'Journalis', '--']","['O', 'O', 'O']","['O', 'O', 'O']"
1,['Dodo Dodo'],"['Dodo', 'Dodo']","['Dodo', 'Dodo']","['B-PERSON', 'I-PERSON']","['O', 'O']"
2,['Dodo Dodo / tr.'],"['Dodo', 'Dodo', '/', 'tr.', 'by', 'Dodo', 'Do...","['Dodo', 'Dodo', '/', 'tr', '.', 'by', 'Dodo',...","['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'B-PER...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,['Dodo Dodo is the doyen of the Chinese news m...,"['Dodo', 'Dodo', 'is', 'the', 'doyen', 'of', '...","['Dodo', 'Dodo', 'is', 'the', 'doyen', 'of', '...","['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', '...","['B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'B..."
4,['Still out on the journalistic front lines at...,"['Still', 'out', 'on', 'the', 'journalistic', ...","['Still', 'out', 'on', 'the', 'journalistic', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DA...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DA..."


In [36]:
import ast
import yaml

# The four list columns come back from the CSV as text holding Python list literals.
# ast.literal_eval parses exactly that syntax; a YAML parser follows different quoting
# and escape rules, so it can misread tokens containing backslashes or quotes.
# Each column is converted with Series.apply rather than DataFrame.applymap.
for list_column in ["Detected_ners", "Detected_ner_types", "Actual_ners", "Actual_ner_types"]:
    df[list_column] = df[list_column].apply(ast.literal_eval)


# Row positions whose actual and detected tag lists differ in length.
unequal = [
    position
    for position, (actual_tags, detected_tags) in enumerate(zip(df.Actual_ner_types, df.Detected_ner_types))
    if len(actual_tags) != len(detected_tags)
]

In [37]:
len(unequal)

179

In [38]:
def multicount(searchlist,target):
    """Return the indices of all entries of ``searchlist`` equal to ``target``.

    The comparison ignores case: both sides are lower-cased before comparing.
    An empty list is returned when nothing matches.
    """
    return [
        position
        for position, candidate in enumerate(searchlist)
        if candidate.lower() == target.lower()
    ]
 
detect_ner = []
detect_type = []

# Re-join the detected token pair "'" + "s" into a single "'s" token, tagged "O".
# The two columns are iterated directly, so each row's lists are fetched once
# instead of through a df.iloc row lookup on every token.
for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    last = len(tokens) - 1
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0  # 1 while the "s" of a just-merged "'s" still has to be skipped
    for j,d in enumerate(tokens):
        if (j < last and tokens[j+1] == "s" and d == "'"):
            detect_ner_temp.append("'s")
            detect_type_temp.append("O")
            flag = 1
        elif d == "s" and flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(tags[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [39]:
# Write the merged detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [40]:
# Row positions whose actual and detected tag lists differ in length.
unequal = [
    position
    for position, (actual_tags, detected_tags) in enumerate(zip(df.Actual_ner_types, df.Detected_ner_types))
    if len(actual_tags) != len(detected_tags)
]

In [41]:
len(unequal)

77

In [42]:
detect_ner = []
detect_type = []

# When a detected token that already contains "." is followed by a separate "."
# token, fold that "." into it (the merged token keeps the first token's tag).
# Rows are read once by iterating the columns instead of calling df.iloc per token.
for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    last = len(tokens) - 1
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0  # 1 while the "." that was just folded in still has to be skipped
    for j,d in enumerate(tokens):
        if (j < last and tokens[j+1] == "." and "." in d):
            detect_ner_temp.append(d + ".")
            detect_type_temp.append(tags[j])
            flag = 1
        elif d == "." and flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(tags[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [43]:
# Write the merged detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [44]:
# Row positions whose actual and detected tag lists differ in length.
unequal = [
    position
    for position, (actual_tags, detected_tags) in enumerate(zip(df.Actual_ner_types, df.Detected_ner_types))
    if len(actual_tags) != len(detected_tags)
]

In [45]:
len(unequal)

78

In [46]:
# Print actual vs. detected tokens and tags for the first four misaligned rows.
for i in unequal[:4]:
    mismatch = df.iloc[i,:]
    print("actual")
    print(mismatch.Actual_ners)
    print(mismatch.Actual_ner_types)
    print("detected")
    print(mismatch.Detected_ners)
    print(mismatch.Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['Dodo', 'Dodo', '/', 'tr.', 'by', 'Dodo', 'Dodo', '-RRB-']
['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O']
detected
['Dodo', 'Dodo', '/', 'tr', '.', 'by', 'Dodo', 'Dodo', '-', 'RRB', '-']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['Now', 'that', 'I', "'m", 'in', 'my', '80s', ',', 'looking', 'back', 'I', 'think', 'I', 'can', 'say', 'I', "'ve", 'done', 'alright', 'in', 'both', 'respects', ',', 'and', 'I', "'ve", 'learnt', 'that', 'both', 'reporting', 'the', 'news', 'well', ',', 'and', 'finding', 'favor', 'with', 'women', ',', 'depend', 'on', 'your', 'character', 'and', 'conduct', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['Now', 'that', 'I', "'", 'm', 'in', 'my', '80s', 

In [47]:
detect_ner = []
detect_type = []

# Glue a detected "'" token onto the token that follows it ("'" + "m" -> "'m");
# the merged token keeps the tag of the "'" token.
# Rows are read once by iterating the columns instead of calling df.iloc per token.
# NOTE(review): the merge test is evaluated before the skip test, so a "'" that
# directly follows another "'" starts a second merge instead of being skipped.
for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    last = len(tokens) - 1
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0  # 1 while the token that was just glued on still has to be skipped
    for j,d in enumerate(tokens):
        if (j < last and d == "'"):
            detect_ner_temp.append("'" + tokens[j+1])
            detect_type_temp.append(tags[j])
            flag = 1
        elif flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(tags[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [48]:
# Write the merged detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [49]:
# Row positions whose actual and detected tag lists differ in length.
unequal = [
    position
    for position, (actual_tags, detected_tags) in enumerate(zip(df.Actual_ner_types, df.Detected_ner_types))
    if len(actual_tags) != len(detected_tags)
]

In [50]:
len(unequal)

68

In [51]:
actual_ner = []
actual_type = []

# Same apostrophe merge as for the detected side, applied to the actual tokens:
# a lone "'" is glued onto the token after it and keeps the tag of the "'".
# Rows are read once by iterating the columns instead of calling df.iloc per token.
for tokens, tags in zip(df.Actual_ners, df.Actual_ner_types):
    last = len(tokens) - 1
    actual_ner_temp = []
    actual_type_temp = []
    flag = 0  # 1 while the token that was just glued on still has to be skipped
    for j,d in enumerate(tokens):
        if (j < last and d == "'"):
            actual_ner_temp.append("'" + tokens[j+1])
            actual_type_temp.append(tags[j])
            flag = 1
        elif flag == 1:
            flag = 0
            continue
        else:
            actual_ner_temp.append(d)
            actual_type_temp.append(tags[j])
    actual_ner.append(actual_ner_temp)
    actual_type.append(actual_type_temp)

In [52]:
# Write the merged actual tokens and tags back into the frame.
df["Actual_ners"] = actual_ner
df["Actual_ner_types"] = actual_type

In [53]:
# Row positions whose actual and detected tag lists differ in length.
unequal = [
    position
    for position, (actual_tags, detected_tags) in enumerate(zip(df.Actual_ner_types, df.Detected_ner_types))
    if len(actual_tags) != len(detected_tags)
]

In [54]:
len(unequal)

59

In [55]:
detect_ner = []
detect_type = []

# Fold a detected "." token into the token before it ("tr" + "." -> "tr."), keeping
# that token's tag. The `j < len - 2` bound means a "." in the final position of
# the row is never folded and stays a token of its own.
# Rows are read once by iterating the columns instead of calling df.iloc per token.
for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    merge_limit = len(tokens) - 2
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0  # 1 while the "." that was just folded in still has to be skipped
    for j,d in enumerate(tokens):
        if (j < merge_limit) and tokens[j+1] == ".":
            detect_ner_temp.append(d + ".")
            detect_type_temp.append(tags[j])
            flag = 1
        elif d == "." and flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(tags[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [56]:
# Write the merged detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [57]:
# Row positions whose actual and detected tag lists differ in length.
unequal = [
    position
    for position, (actual_tags, detected_tags) in enumerate(zip(df.Actual_ner_types, df.Detected_ner_types))
    if len(actual_tags) != len(detected_tags)
]

In [58]:
len(unequal)

30

In [59]:
actual_ner = []
actual_type = []

# Same inner-period merge as for the detected side, applied to the actual tokens:
# a "." that is not in the final position of the row is folded into the token
# before it, which keeps its own tag.
# Rows are read once by iterating the columns instead of calling df.iloc per token.
for tokens, tags in zip(df.Actual_ners, df.Actual_ner_types):
    merge_limit = len(tokens) - 2
    actual_ner_temp = []
    actual_type_temp = []
    flag = 0  # 1 while the "." that was just folded in still has to be skipped
    for j,d in enumerate(tokens):
        if (j < merge_limit) and tokens[j+1] == ".":
            actual_ner_temp.append(d + ".")
            actual_type_temp.append(tags[j])
            flag = 1
        elif d == "." and flag == 1:
            flag = 0
            continue
        else:
            actual_ner_temp.append(d)
            actual_type_temp.append(tags[j])
    actual_ner.append(actual_ner_temp)
    actual_type.append(actual_type_temp)

In [60]:
# Write the merged actual tokens and tags back into the frame.
df["Actual_ners"] = actual_ner
df["Actual_ner_types"] = actual_type

In [61]:
# Row positions whose actual and detected tag lists differ in length.
unequal = [
    position
    for position, (actual_tags, detected_tags) in enumerate(zip(df.Actual_ner_types, df.Detected_ner_types))
    if len(actual_tags) != len(detected_tags)
]

In [62]:
len(unequal)

29

In [63]:
# Print actual vs. detected tokens and tags for the first ten misaligned rows.
for i in unequal[:10]:
    mismatch = df.iloc[i,:]
    print("actual")
    print(mismatch.Actual_ners)
    print(mismatch.Actual_ner_types)
    print("detected")
    print(mismatch.Detected_ners)
    print(mismatch.Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['Dodo', 'Dodo', '/', 'tr.', 'by', 'Dodo', 'Dodo', '-RRB-']
['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O']
detected
['Dodo', 'Dodo', '/', 'tr.', 'by', 'Dodo', 'Dodo', '-', 'RRB', '-']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['In', 'a', 'November', '28', 'press', 'release', ',', 'the', 'Mainland', 'Affairs', 'Council', '-LRB-', 'MAC', '-RRB-', 'the', 'government', 'body', 'in', 'charge', 'of', 'cross-strait', 'affairs', ',', 'diplomatically', 'stated', 'that', 'it', 'would', 'observe', 'and', 'implement', 'the', 'suggestions', 'of', 'the', 'Advisory', 'Group', 'if', 'and', 'when', 'they', 'become', 'official', 'government', 'policy', '.']
['O', 'O', 'B-DATE', 'I-DATE', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O',

In [64]:
detect_ner = []
detect_type = []

# Rebuild bracket placeholders that were split into three detected tokens:
# "-", "LRB"/"RRB", "-"  ->  "-LRB-"/"-RRB-". The merged token takes the tag of the
# middle token, and the two tokens after the opening "-" are then skipped.
# Rows are read once by iterating the columns instead of calling df.iloc per token.
# NOTE(review): the third token is not checked to be "-" before it is dropped.
for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    merge_limit = len(tokens) - 2
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0  # 0: normal, 1: skip the middle token next, 2: skip the closing "-" next
    for j,d in enumerate(tokens):
        if (j < merge_limit and d == "-" and tokens[j+1] in ["LRB","RRB"]):
            detect_ner_temp.append("-" + tokens[j+1] + "-")
            detect_type_temp.append(tags[j+1])
            flag = 1
        elif flag == 1:
            flag = 2
            continue
        elif flag == 2:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(tags[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [65]:
# Write the merged detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [66]:
# Row positions whose actual and detected tag lists differ in length.
unequal = [
    position
    for position, (actual_tags, detected_tags) in enumerate(zip(df.Actual_ner_types, df.Detected_ner_types))
    if len(actual_tags) != len(detected_tags)
]

In [67]:
len(unequal)

1

In [68]:
# Print actual vs. detected tokens and tags for the first row still misaligned.
i = unequal[0]
mismatch = df.iloc[i,:]
print("actual")
print(mismatch.Actual_ners)
print(mismatch.Actual_ner_types)
print("detected")
print(mismatch.Detected_ners)
print(mismatch.Detected_ner_types)
print("-----------------------------------------------------------")

actual
['For', 'more', 'detailed', 'information', 'visit', 'http://www.tamsui.gov.tw/', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['For', 'more', 'detailed', 'information', 'visit', 'http://www.tamsui.gov.tw/.']
['O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------


In [69]:
# For rows that could not be aligned, replace the detected tokens and tags with the
# actual ones so that both sides have the same length for scoring.
# NOTE: these rows are therefore scored as if they had been predicted perfectly.
# df.at writes into the frame itself. The original assigned to an attribute of
# df.iloc[i,:], a temporary row object, so the frame was not reliably updated.
# `unequal` holds row positions, which equal the labels of the default RangeIndex.
for i in unequal:
    df.at[i, "Detected_ners"] = df.at[i, "Actual_ners"]
    df.at[i, "Detected_ner_types"] = df.at[i, "Actual_ner_types"]

In [71]:
from seqeval.metrics import accuracy_score, f1_score
# Report the seqeval accuracy and F1 of the detected tags against the actual tags
# for the file currently loaded (its path is printed first).
print(f)
print("accuracy score")
actual_tag_lists = df.Actual_ner_types.tolist()
detected_tag_lists = df.Detected_ner_types.tolist()
print(accuracy_score(actual_tag_lists, detected_tag_lists))
print(f1_score(actual_tag_lists, detected_tag_lists))

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_mz_ner_perturb1_perturb1.csv
accuracy score
0.9678591709256105
0.8338368580060423


In [72]:
f = flist[1]
print(f)

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_tc_ner_perturb1_perturb1.csv


In [73]:
df = pd.read_csv(f)

In [74]:
df.head()

Unnamed: 0,Detected_sentence,Actual_ners,Detected_ners,Actual_ner_types,Detected_ner_types
0,['But %um guessed what happened .'],"['But', '%um', 'guessed', 'what', 'happened', ...","['But', '%um', 'guessed', 'what', 'happened', ...","['O', 'O', 'O', 'O', 'O', 'O']","['O', 'O', 'O', 'O', 'O', 'O']"
1,['What ?'],"['What', '?']","['What', '?']","['O', 'O']","['O', 'O']"
2,['The %um'],"['The', '%um']","['The', '%um']","['O', 'O']","['O', 'O']"
3,['Again ?'],"['Again', '?']","['Again', '?']","['O', 'O']","['O', 'O']"
4,"[""118.91_120.82_B: Let 's see ,""]","['118.91_120.82_B:', 'Let', ""'s"", 'see', ',']","['118.91_120.82_B', ':', 'Let', ""'"", 's', 'see...","['O', 'O', 'O', 'O', 'O']","['O', 'O', 'O', 'O', 'O', 'O', 'O']"


In [75]:
import ast

# The four list columns come back from the CSV as text holding Python list literals.
# ast.literal_eval parses exactly that syntax; a YAML parser follows different quoting
# and escape rules, so it can misread tokens containing backslashes or quotes.
# Each column is converted with Series.apply rather than DataFrame.applymap.
for list_column in ["Detected_ners", "Detected_ner_types", "Actual_ners", "Actual_ner_types"]:
    df[list_column] = df[list_column].apply(ast.literal_eval)

In [76]:
# Row positions whose actual and detected tag lists differ in length.
unequal = [
    position
    for position, (actual_tags, detected_tags) in enumerate(zip(df.Actual_ner_types, df.Detected_ner_types))
    if len(actual_tags) != len(detected_tags)
]

In [77]:
len(unequal)

338

In [78]:
detect_ner = []
detect_type = []

# Fold a detected "." token into the token before it, keeping that token's tag.
# The `j < len - 2` bound means a "." in the final position of the row is never
# folded and stays a token of its own.
# Rows are read once by iterating the columns instead of calling df.iloc per token.
for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    merge_limit = len(tokens) - 2
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0  # 1 while the "." that was just folded in still has to be skipped
    for j,d in enumerate(tokens):
        if (j < merge_limit) and tokens[j+1] == ".":
            detect_ner_temp.append(d + ".")
            detect_type_temp.append(tags[j])
            flag = 1
        elif d == "." and flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(tags[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [79]:
# Write the merged detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [80]:
# Row positions whose actual and detected tag lists differ in length.
unequal = [
    position
    for position, (actual_tags, detected_tags) in enumerate(zip(df.Actual_ner_types, df.Detected_ner_types))
    if len(actual_tags) != len(detected_tags)
]

In [81]:
len(unequal)

364

In [82]:
actual_ner = []
actual_type = []

# Inner-period merge applied to the actual tokens: a "." that is not in the final
# position of the row is folded into the token before it, which keeps its own tag.
# Rows are read once by iterating the columns instead of calling df.iloc per token.
for tokens, tags in zip(df.Actual_ners, df.Actual_ner_types):
    merge_limit = len(tokens) - 2
    actual_ner_temp = []
    actual_type_temp = []
    flag = 0  # 1 while the "." that was just folded in still has to be skipped
    for j,d in enumerate(tokens):
        if (j < merge_limit) and tokens[j+1] == ".":
            actual_ner_temp.append(d + ".")
            actual_type_temp.append(tags[j])
            flag = 1
        elif d == "." and flag == 1:
            flag = 0
            continue
        else:
            actual_ner_temp.append(d)
            actual_type_temp.append(tags[j])
    actual_ner.append(actual_ner_temp)
    actual_type.append(actual_type_temp)

In [83]:
# Write the merged actual tokens and tags back into the frame.
df["Actual_ners"] = actual_ner
df["Actual_ner_types"] = actual_type

In [84]:
# Row positions whose actual and detected tag lists differ in length.
unequal = [
    position
    for position, (actual_tags, detected_tags) in enumerate(zip(df.Actual_ner_types, df.Detected_ner_types))
    if len(actual_tags) != len(detected_tags)
]

In [85]:
len(unequal)

338

In [86]:
detect_ner = []
detect_type = []

# Glue a detected "'" token onto the token that follows it ("'" + "s" -> "'s");
# the merged token keeps the tag of the "'" token.
# Rows are read once by iterating the columns instead of calling df.iloc per token.
# NOTE(review): the merge test is evaluated before the skip test, so a "'" that
# directly follows another "'" starts a second merge instead of being skipped.
for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    last = len(tokens) - 1
    detect_ner_temp = []
    detect_type_temp = []
    flag = 0  # 1 while the token that was just glued on still has to be skipped
    for j,d in enumerate(tokens):
        if (j < last and d == "'"):
            detect_ner_temp.append("'" + tokens[j+1])
            detect_type_temp.append(tags[j])
            flag = 1
        elif flag == 1:
            flag = 0
            continue
        else:
            detect_ner_temp.append(d)
            detect_type_temp.append(tags[j])
    detect_ner.append(detect_ner_temp)
    detect_type.append(detect_type_temp)

In [87]:
# Write the merged detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [88]:
# Row positions whose actual and detected tag lists differ in length.
unequal = [
    position
    for position, (actual_tags, detected_tags) in enumerate(zip(df.Actual_ner_types, df.Detected_ner_types))
    if len(actual_tags) != len(detected_tags)
]

In [89]:
len(unequal)

42

In [90]:
actual_ner = []
actual_type = []

# Apostrophe merge applied to the actual tokens: a lone "'" is glued onto the
# token after it and keeps the tag of the "'".
# Rows are read once by iterating the columns instead of calling df.iloc per token.
for tokens, tags in zip(df.Actual_ners, df.Actual_ner_types):
    last = len(tokens) - 1
    actual_ner_temp = []
    actual_type_temp = []
    flag = 0  # 1 while the token that was just glued on still has to be skipped
    for j,d in enumerate(tokens):
        if (j < last and d == "'"):
            actual_ner_temp.append("'" + tokens[j+1])
            actual_type_temp.append(tags[j])
            flag = 1
        elif flag == 1:
            flag = 0
            continue
        else:
            actual_ner_temp.append(d)
            actual_type_temp.append(tags[j])
    actual_ner.append(actual_ner_temp)
    actual_type.append(actual_type_temp)

In [91]:
# Write the merged actual tokens and tags back into the frame.
df["Actual_ners"] = actual_ner
df["Actual_ner_types"] = actual_type

In [92]:
# Row positions whose actual and detected tag lists differ in length.
unequal = [
    position
    for position, (actual_tags, detected_tags) in enumerate(zip(df.Actual_ner_types, df.Detected_ner_types))
    if len(actual_tags) != len(detected_tags)
]

In [93]:
len(unequal)

42

In [94]:
# Print actual vs. detected tokens and tags for the first ten misaligned rows.
for i in unequal[:10]:
    mismatch = df.iloc[i,:]
    print("actual")
    print(mismatch.Actual_ners)
    print(mismatch.Actual_ner_types)
    print("detected")
    print(mismatch.Detected_ners)
    print(mismatch.Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['118.91_120.82_B:', 'Let', "'s", 'see', ',']
['O', 'O', 'O', 'O', 'O']
detected
['118.91_120.82_B', ':', 'Let', "'s", 'see', ',']
['O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['Very', 'ha-']
['O', 'O']
detected
['Very', 'ha', '-']
['O', 'O', 'O']
-----------------------------------------------------------
actual
['oh', '296.72_297.91_A:', 'oh', 'no', '.']
['O', 'O', 'O', 'O', 'O']
detected
['oh', '296.72_297.91_A', ':', 'oh', 'no', '.']
['O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['You', 'know', ',', 'I', 'mean', ',', 'we', "'re", 'd-', ',', 'you', 'know', ',']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['You', 'know', ',', 'I', 'mean', ',', 'we', "'re", 'd', '-', ',', 'you', 'know', ',']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['but', 'I', 'thi

In [95]:
# For the first ten misaligned rows, replace the detected tokens and tags with the
# actual ones so that both sides have the same length for scoring.
# NOTE: these rows are therefore scored as if they had been predicted perfectly.
# df.at writes into the frame itself. The original assigned to an attribute of
# df.iloc[i,:], a temporary row object, so the frame was not reliably updated.
# `unequal` holds row positions, which equal the labels of the default RangeIndex.
for i in unequal[:10]:
    df.at[i, "Detected_ners"] = df.at[i, "Actual_ners"]
    df.at[i, "Detected_ner_types"] = df.at[i, "Actual_ner_types"]

In [96]:
# Print actual vs. detected tokens and tags for misaligned rows 10 to 19.
for i in unequal[10:20]:
    mismatch = df.iloc[i,:]
    print("actual")
    print(mismatch.Actual_ners)
    print(mismatch.Actual_ner_types)
    print("detected")
    print(mismatch.Detected_ners)
    print(mismatch.Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['no', 'w-']
['O', 'O']
detected
['no', 'w', '-']
['O', 'O', 'O']
-----------------------------------------------------------
actual
['And', 'she', 'has', 't-', 'aw']
['O', 'O', 'O', 'O', 'O']
detected
['And', 'she', 'has', 't', '-', 'aw']
['O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['that', 'had', "n't", 'happ-', ',']
['O', 'O', 'O', 'O', 'O']
detected
['that', 'had', "n't", 'happ', '-', ',']
['O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['Because', ',', 'I', 'r-']
['O', 'O', 'O', 'O']
detected
['Because', ',', 'I', 'r', '-']
['O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['oh', 'I', "'m", 'han-']
['O', 'O', 'O', 'O']
detected
['oh', 'I', "'m", 'han', '-']
['O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['So', 'it', "'s", 'just', ',', 'l-', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O']
det

In [97]:
# For misaligned rows 10 to 19, replace the detected tokens and tags with the
# actual ones so that both sides have the same length for scoring.
# NOTE: these rows are therefore scored as if they had been predicted perfectly.
# df.at writes into the frame itself. The original assigned to an attribute of
# df.iloc[i,:], a temporary row object, so the frame was not reliably updated.
# `unequal` holds row positions, which equal the labels of the default RangeIndex.
for i in unequal[10:20]:
    df.at[i, "Detected_ners"] = df.at[i, "Actual_ners"]
    df.at[i, "Detected_ner_types"] = df.at[i, "Actual_ner_types"]

In [98]:
# Print actual vs. detected tokens and tags for misaligned rows 20 to 29.
for i in unequal[20:30]:
    mismatch = df.iloc[i,:]
    print("actual")
    print(mismatch.Actual_ners)
    print(mismatch.Actual_ner_types)
    print("detected")
    print(mismatch.Detected_ners)
    print(mismatch.Detected_ner_types)
    print("-----------------------------------------------------------")

actual
['actually', ',', 'I', "'m", 'not', 'sure', 'if', 'we', "'re", 'shooting', 'for', 'op-', ',']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['actually', ',', 'I', "'m", 'not', 'sure', 'if', 'we', "'re", 'shooting', 'for', 'op', '-', ',']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['I', 'mean', 'the', 'thing', 'that', 'i-']
['O', 'O', 'O', 'O', 'O', 'O']
detected
['I', 'mean', 'the', 'thing', 'that', 'i', '-']
['O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['it', 'has', 'an', 'off', 'board', 'power', 'supply', 'which', 'they', 'did', "n't", 'steal', ',', 'which', 'makes', 'the', 'thing', 'that', 'they', '**stoled**', 'absolutely', 'worthless', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['it', 'has', 'an', 'off', 'boar

In [99]:
# Replace the predictions of mismatched rows 20-29 with the gold annotations.
# Written with .at because assigning through `df.iloc[i,:].<col>` targets a
# temporary row Series and is not guaranteed to modify `df`.
# NOTE(review): these rows then count as perfectly predicted in later scores —
# confirm that is intended.
for i in unequal[20:30]:
    row_label = df.index[i]
    df.at[row_label, "Detected_ners"] = df.at[row_label, "Actual_ners"]
    df.at[row_label, "Detected_ner_types"] = df.at[row_label, "Actual_ner_types"]

In [100]:
# Print gold vs. predicted tokens and labels for mismatched rows 30 onwards.
for pos in unequal[30:]:
    row = df.iloc[pos]
    for heading, tokens_col, labels_col in (
        ("actual", "Actual_ners", "Actual_ner_types"),
        ("detected", "Detected_ners", "Detected_ner_types"),
    ):
        print(heading)
        print(row[tokens_col])
        print(row[labels_col])
    print("-----------------------------------------------------------")

actual
['So', 'I', 'mean', 'any', 'income', 'that', 'y-']
['O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['So', 'I', 'mean', 'any', 'income', 'that', 'y', '-']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['I', 'had', 'Kans-', ',']
['O', 'O', 'O', 'O']
detected
['I', 'had', 'Kans', '-', ',']
['O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['I', "'ll", 'give', 'you', 'a', 'call', 'when', 'we', 'arrive', 'in', 'h-', ',']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['I', "'ll", 'give', 'you', 'a', 'call', 'when', 'we', 'arrive', 'in', 'h', '-', ',']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['but', 'that', "'s", 'only', 'when', 'yo-', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['but', 'that', "'s", 'only', 'when', 'yo', '-', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O

In [101]:
# Replace the predictions of the remaining mismatched rows (30 onwards) with
# the gold annotations.
# Written with .at because assigning through `df.iloc[i,:].<col>` targets a
# temporary row Series and is not guaranteed to modify `df`.
# NOTE(review): these rows then count as perfectly predicted in later scores —
# confirm that is intended.
for i in unequal[30:]:
    row_label = df.index[i]
    df.at[row_label, "Detected_ners"] = df.at[row_label, "Actual_ners"]
    df.at[row_label, "Detected_ner_types"] = df.at[row_label, "Actual_ner_types"]

In [102]:
# Report the file being evaluated followed by its accuracy and F1 values
# (the second number printed is the F1 score; it has no label of its own).
gold_labels = df["Actual_ner_types"].tolist()
pred_labels = df["Detected_ner_types"].tolist()
print(f)
print("accuracy score")
print(accuracy_score(gold_labels, pred_labels))
print(f1_score(gold_labels, pred_labels))

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_tc_ner_perturb1_perturb1.csv
accuracy score
0.9790083586626139
0.7565337001375516


In [103]:
# Switch to the third entry of `flist` as the file under evaluation and echo
# its path.
f = flist[2]
print(f)

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_pt_ner_perturb1_perturb1.csv


In [104]:
# Load the evaluation CSV and parse the four stringified list columns back
# into Python lists.
df = pd.read_csv(f)
# One loop over Series.map replaces four copy-pasted calls to
# DataFrame.applymap (deprecated since pandas 2.1); the column order is the
# same as before.
# NOTE(review): the cells look like Python list reprs, for which
# ast.literal_eval would be the exact inverse — confirm yaml.safe_load parses
# every token identically.
for list_col in ("Detected_ners", "Detected_ner_types", "Actual_ners", "Actual_ner_types"):
    df[list_col] = df[list_col].map(yaml.safe_load)

# Positions of the rows whose gold and predicted label sequences differ in
# length; zipping the two columns avoids building a row Series per lookup.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [105]:
# Number of rows whose gold and predicted label sequences differ in length.
len(unequal)

57

In [106]:
# Re-join every standalone apostrophe in the predicted tokens with the token
# that follows it (e.g. "'", "love" -> "'love"). The merged token keeps the
# label that was predicted for the apostrophe itself.
# Results are collected per row in `detect_ner` / `detect_type`.
detect_ner = []
detect_type = []

for tokens, labels in zip(df["Detected_ners"], df["Detected_ner_types"]):
    merged_tokens = []
    merged_labels = []
    skip_next = False
    last = len(tokens) - 1
    for j, tok in enumerate(tokens):
        if skip_next:
            # This token was already glued onto the preceding apostrophe.
            # Checking this first (the original checked it second) prevents a
            # second consecutive apostrophe from being consumed twice.
            skip_next = False
            continue
        if tok == "'" and j < last:
            merged_tokens.append("'" + tokens[j + 1])
            merged_labels.append(labels[j])
            skip_next = True
        else:
            merged_tokens.append(tok)
            merged_labels.append(labels[j])
    detect_ner.append(merged_tokens)
    detect_type.append(merged_labels)

In [107]:
# Store the re-merged predicted tokens and labels back in the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [108]:
# Recompute the positions of rows whose gold and predicted label sequences
# differ in length.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [109]:
# Rows still mismatched in length after the previous recomputation.
len(unequal)

14

In [110]:
# Print gold vs. predicted tokens and labels for the first ten mismatched rows.
for pos in unequal[:10]:
    row = df.iloc[pos]
    for heading, tokens_col, labels_col in (
        ("actual", "Actual_ners", "Actual_ner_types"),
        ("detected", "Detected_ners", "Detected_ner_types"),
    ):
        print(heading)
        print(row[tokens_col])
        print(row[labels_col])
    print("-----------------------------------------------------------")

actual
['Jesus', 'answered', ',', 'You', 'must', 'not', 'murder', 'anyone', ',', 'you', 'must', 'not', 'commit', 'adultery', ',', 'you', 'must', 'not', 'steal', ',', 'you', 'must', 'not', 'tell', 'lies', 'about', 'others', ',', 'you', 'must', 'respect', 'your', 'father', 'and', 'mother', ',', 'and', "'", 'love', 'your', 'neighbor', 'the', 'same', 'as', 'you', 'love', 'yourself', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['Jesus', 'answered', ',', 'You', 'must', 'not', 'murder', 'anyone', ',', 'you', 'must', 'not', 'commit', 'adultery', ',', 'you', 'must', 'not', 'steal', ',', 'you', 'must', 'not', 'tell', 'lies', 'about', 'others', ',', 'you', 'must', 'respect', 'your', 'father', 'and', 'mother', ',', 'and', "'love", 'your', 'neighbor', 'the', 'same', 'as', 'you', 'love', 'you

In [111]:
# Re-join every standalone apostrophe in the gold tokens with the token that
# follows it, so the gold tokenisation matches the merged predictions. The
# merged token keeps the gold label of the apostrophe itself.
# Results are collected per row in `actual_ner` / `actual_type`.
actual_ner = []
actual_type = []

for tokens, labels in zip(df["Actual_ners"], df["Actual_ner_types"]):
    merged_tokens = []
    merged_labels = []
    skip_next = False
    last = len(tokens) - 1
    for j, tok in enumerate(tokens):
        if skip_next:
            # This token was already glued onto the preceding apostrophe.
            # Checking this first (the original checked it second) prevents a
            # second consecutive apostrophe from being consumed twice.
            skip_next = False
            continue
        if tok == "'" and j < last:
            merged_tokens.append("'" + tokens[j + 1])
            merged_labels.append(labels[j])
            skip_next = True
        else:
            merged_tokens.append(tok)
            merged_labels.append(labels[j])
    actual_ner.append(merged_tokens)
    actual_type.append(merged_labels)

In [112]:
# Store the re-merged gold tokens and labels back in the frame.
df["Actual_ners"] = actual_ner
df["Actual_ner_types"] = actual_type

In [113]:
# Recompute the positions of rows whose gold and predicted label sequences
# differ in length.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [114]:
# Remaining length mismatches; scoring below needs equal-length sequences.
len(unequal)

0

In [115]:
# Report the file being evaluated followed by its accuracy and F1 values
# (the second number printed is the F1 score; it has no label of its own).
gold_labels = df["Actual_ner_types"].tolist()
pred_labels = df["Detected_ner_types"].tolist()
print(f)
print("accuracy score")
print(accuracy_score(gold_labels, pred_labels))
print(f1_score(gold_labels, pred_labels))

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_pt_ner_perturb1_perturb1.csv
accuracy score
0.9405382210506957
0.0


In [116]:
# Switch to the fourth entry of `flist` as the file under evaluation.
f = flist[3]

In [117]:
# Display the path of the file currently selected for evaluation.
f

'/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bn_ner_perturb1_perturb1.csv'

In [118]:
# Load the evaluation CSV and parse the four stringified list columns back
# into Python lists.
df = pd.read_csv(f)
# One loop over Series.map replaces four copy-pasted calls to
# DataFrame.applymap (deprecated since pandas 2.1); the column order is the
# same as before.
# NOTE(review): the cells look like Python list reprs, for which
# ast.literal_eval would be the exact inverse — confirm yaml.safe_load parses
# every token identically.
for list_col in ("Detected_ners", "Detected_ner_types", "Actual_ners", "Actual_ner_types"):
    df[list_col] = df[list_col].map(yaml.safe_load)

# Positions of the rows whose gold and predicted label sequences differ in
# length; zipping the two columns avoids building a row Series per lookup.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [119]:
# Number of rows whose gold and predicted label sequences differ in length.
len(unequal)

308

In [120]:
# Re-join every standalone apostrophe in the predicted tokens with the token
# that follows it (e.g. "'", "will" -> "'will"). The merged token keeps the
# label that was predicted for the apostrophe itself.
# Results are collected per row in `detect_ner` / `detect_type`.
detect_ner = []
detect_type = []

for tokens, labels in zip(df["Detected_ners"], df["Detected_ner_types"]):
    merged_tokens = []
    merged_labels = []
    skip_next = False
    last = len(tokens) - 1
    for j, tok in enumerate(tokens):
        if skip_next:
            # This token was already glued onto the preceding apostrophe.
            # Checking this first (the original checked it second) prevents a
            # second consecutive apostrophe from being consumed twice.
            skip_next = False
            continue
        if tok == "'" and j < last:
            merged_tokens.append("'" + tokens[j + 1])
            merged_labels.append(labels[j])
            skip_next = True
        else:
            merged_tokens.append(tok)
            merged_labels.append(labels[j])
    detect_ner.append(merged_tokens)
    detect_type.append(merged_labels)

In [121]:
# Store the re-merged predicted tokens and labels back in the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [122]:
# Recompute the positions of rows whose gold and predicted label sequences
# differ in length.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [123]:
# Rows still mismatched in length after the previous recomputation.
len(unequal)

70

In [124]:
# Print gold vs. predicted tokens and labels for the first ten mismatched rows.
for pos in unequal[:10]:
    row = df.iloc[pos]
    for heading, tokens_col, labels_col in (
        ("actual", "Actual_ners", "Actual_ner_types"),
        ("detected", "Detected_ners", "Detected_ner_types"),
    ):
        print(heading)
        print(row[tokens_col])
        print(row[labels_col])
    print("-----------------------------------------------------------")

actual
['Dodo', "'s", '1989', 'novel', '`', 'Soul', 'Mountain', "'", 'will', 'be', 'published', 'for', 'the', 'first', 'time', 'in', 'America', '.']
['B-PERSON', 'O', 'B-DATE', 'O', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'I-WORK_OF_ART', 'I-WORK_OF_ART', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O']
detected
['Dodo', "'s", '1989', 'novel', '`', 'Soul', 'Mountain', "'will", 'be', 'published', 'for', 'the', 'first', 'time', 'in', 'America', '.']
['B-PERSON', 'O', 'B-DATE', 'O', 'O', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O', 'O', 'O', 'O', 'O', 'B-ORDINAL', 'O', 'O', 'B-GPE', 'O']
-----------------------------------------------------------
actual
['This', 'novel', '`', 'soul', 'mountain', "'", 'which', 'is', 'just', 'being', 'released', 'in', 'the', 'United', 'States', 'this', 'week', 'is', 'in', 'every', 'possible', 'way', 'a', 'celebration', 'of', 'what', 'freedom', 'means', '.']
['O', 'O', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'I-WORK_OF_ART', 'I-WORK_OF_ART', 'O', 'O', 'O', 'O', 'O', 'O',

In [125]:
# Re-join every standalone apostrophe in the gold tokens with the token that
# follows it, so the gold tokenisation matches the merged predictions.
# Results are collected per row in `actual_ner` / `actual_type`.
# NOTE(review): the merged token takes the gold label of the FOLLOWING token
# (j + 1), whereas the predicted-side merge in this notebook keeps the
# apostrophe's own label (j) — confirm this asymmetry is intended.
actual_ner = []
actual_type = []

for tokens, labels in zip(df["Actual_ners"], df["Actual_ner_types"]):
    merged_tokens = []
    merged_labels = []
    skip_next = False
    last = len(tokens) - 1
    for j, tok in enumerate(tokens):
        if skip_next:
            # This token was already glued onto the preceding apostrophe.
            # Checking this first (the original checked it second) prevents a
            # second consecutive apostrophe from being consumed twice.
            skip_next = False
            continue
        if tok == "'" and j < last:
            merged_tokens.append("'" + tokens[j + 1])
            merged_labels.append(labels[j + 1])
            skip_next = True
        else:
            merged_tokens.append(tok)
            merged_labels.append(labels[j])
    actual_ner.append(merged_tokens)
    actual_type.append(merged_labels)

In [126]:
# Store the re-merged gold tokens and labels back in the frame.
df["Actual_ners"] = actual_ner
df["Actual_ner_types"] = actual_type

In [127]:
# Recompute the positions of rows whose gold and predicted label sequences
# differ in length.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [128]:
# Rows still mismatched in length after the previous recomputation.
len(unequal)

61

In [129]:
# Print gold vs. predicted tokens and labels for the first ten mismatched rows.
for pos in unequal[:10]:
    row = df.iloc[pos]
    for heading, tokens_col, labels_col in (
        ("actual", "Actual_ners", "Actual_ner_types"),
        ("detected", "Detected_ners", "Detected_ner_types"),
    ):
        print(heading)
        print(row[tokens_col])
        print(row[labels_col])
    print("-----------------------------------------------------------")

actual
['Former', 'U.S.', 'representative', 'Dodo', 'Dodo', 'has', 'died', '.']
['O', 'B-GPE', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O']
detected
['Former', 'U.S', '.', 'representative', 'Dodo', 'Dodo', 'has', 'died', '.']
['O', 'B-GPE', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['Dodo', 'Dodo', 'was', 'first', 'elected', 'to', 'the', 'U.S.', 'house', 'in', '1948', ',', 'when', 'Dodo', 'Dodo', 'was', 'President', 'and', 'for', 'more', 'than', '4', 'decades', 'he', 'trounced', 'every', 'challenger', 'to', 'face', 'him', 'in', 'congressional', 'election', '.']
['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'B-DATE', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['Dodo', 'Dodo', 'was', 'first', 'elected', 'to', 'the', 'U.S', '.', 'house', 'in', '1948', ',', 'when', 'Dodo', 'Dodo', '

In [130]:
# Re-attach a split-off "." to the predicted token before it
# (e.g. "U.S", "." -> "U.S."). Only a "." that is not the last token of the
# row is merged, so a sentence-final period stays a token of its own. The
# merged token keeps the label of the token before the period.
# Results are collected per row in `detect_ner` / `detect_type`.
detect_ner = []
detect_type = []

for tokens, labels in zip(df["Detected_ners"], df["Detected_ner_types"]):
    merged_tokens = []
    merged_labels = []
    skip_next = False
    merge_limit = len(tokens) - 2
    for j, tok in enumerate(tokens):
        if skip_next:
            # This "." was already appended to the previous token. Checking
            # this first (the original checked it second) prevents a run of
            # periods from consuming the same "." twice.
            skip_next = False
            continue
        if j < merge_limit and tokens[j + 1] == ".":
            merged_tokens.append(tok + ".")
            merged_labels.append(labels[j])
            skip_next = True
        else:
            merged_tokens.append(tok)
            merged_labels.append(labels[j])
    detect_ner.append(merged_tokens)
    detect_type.append(merged_labels)

In [131]:
# Store the re-merged predicted tokens and labels back in the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [132]:
# Recompute the positions of rows whose gold and predicted label sequences
# differ in length.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [133]:
# Rows still mismatched in length after the previous recomputation.
len(unequal)

1

In [134]:
# Print gold vs. predicted tokens and labels for every remaining mismatched row.
for pos in unequal:
    row = df.iloc[pos]
    for heading, tokens_col, labels_col in (
        ("actual", "Actual_ners", "Actual_ner_types"),
        ("detected", "Detected_ners", "Detected_ner_types"),
    ):
        print(heading)
        print(row[tokens_col])
        print(row[labels_col])
    print("-----------------------------------------------------------")

actual
['Good', 'to', 'be', '-LRB-', 'bleep', '-RRB-', 'King', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['Good', 'to', 'be', '-', 'LRB', '-', 'bleep', '-', 'RRB', '-', 'King', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------


In [135]:
# Replace the predictions of every remaining mismatched row with the gold
# annotations so both label sequences have the same length.
# Written with .at because assigning through `df.iloc[i,:].<col>` targets a
# temporary row Series and is not guaranteed to modify `df`.
# NOTE(review): these rows then count as perfectly predicted in later scores —
# confirm that is intended.
for i in unequal:
    row_label = df.index[i]
    df.at[row_label, "Detected_ners"] = df.at[row_label, "Actual_ners"]
    df.at[row_label, "Detected_ner_types"] = df.at[row_label, "Actual_ner_types"]

In [136]:
# Report the file being evaluated followed by its accuracy and F1 values
# (the second number printed is the F1 score; it has no label of its own).
gold_labels = df["Actual_ner_types"].tolist()
pred_labels = df["Detected_ner_types"].tolist()
print(f)
print("accuracy score")
print(accuracy_score(gold_labels, pred_labels))
print(f1_score(gold_labels, pred_labels))

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bn_ner_perturb1_perturb1.csv
accuracy score
0.9727092149581138
0.8762080073630925


In [209]:
# Switch to the fifth entry of `flist` as the file under evaluation.
f = flist[4]

In [210]:
# Load the selected CSV as-is; the list columns are still plain strings here.
df = pd.read_csv(f)

In [211]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Detected_sentence,Actual_ners,Detected_ners,Actual_ner_types,Detected_ner_types
0,"['basically , it was unanimously agreed upon b...","['basically', ',', 'it', 'was', 'unanimously',...","['basically', ',', 'it', 'was', 'unanimously',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,"['To express its determination , the Chinese s...","['To', 'express', 'its', 'determination', ',',...","['To', 'express', 'its', 'determination', ',',...","['O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', ...","['O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', ..."
2,['It takes time to prove whether the stock ref...,"['It', 'takes', 'time', 'to', 'prove', 'whethe...","['It', 'takes', 'time', 'to', 'prove', 'whethe...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,"['Dear viewers , the China News program will e...","['Dear', 'viewers', ',', 'the', 'China', 'News...","['Dear', 'viewers', ',', 'the', 'China', 'News...","['O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O...","['O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O..."
4,['This is Dodo Dodo .'],"['This', 'is', 'Dodo', 'Dodo', '.']","['This', 'is', 'Dodo', 'Dodo', '.']","['O', 'O', 'B-PERSON', 'I-PERSON', 'O']","['O', 'O', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O']"


In [212]:
# Load the evaluation CSV and parse the four stringified list columns back
# into Python lists.
df = pd.read_csv(f)
# One loop over Series.map replaces four copy-pasted calls to
# DataFrame.applymap (deprecated since pandas 2.1); the column order is the
# same as before.
# NOTE(review): the cells look like Python list reprs, for which
# ast.literal_eval would be the exact inverse — confirm yaml.safe_load parses
# every token identically.
for list_col in ("Detected_ners", "Detected_ner_types", "Actual_ners", "Actual_ner_types"):
    df[list_col] = df[list_col].map(yaml.safe_load)

# Positions of the rows whose gold and predicted label sequences differ in
# length; zipping the two columns avoids building a row Series per lookup.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [213]:
# Number of rows whose gold and predicted label sequences differ in length.
len(unequal)

1305

In [214]:
# Display the path of the file currently selected for evaluation.
f

'/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bc_ner_perturb1_perturb1.csv'

In [215]:
# Re-join every standalone apostrophe in the predicted tokens with the token
# that follows it. The merged token keeps the label that was predicted for
# the apostrophe itself.
# Results are collected per row in `detect_ner` / `detect_type`.
detect_ner = []
detect_type = []

for tokens, labels in zip(df["Detected_ners"], df["Detected_ner_types"]):
    merged_tokens = []
    merged_labels = []
    skip_next = False
    last = len(tokens) - 1
    for j, tok in enumerate(tokens):
        if skip_next:
            # This token was already glued onto the preceding apostrophe.
            # Checking this first (the original checked it second) prevents a
            # second consecutive apostrophe from being consumed twice.
            skip_next = False
            continue
        if tok == "'" and j < last:
            merged_tokens.append("'" + tokens[j + 1])
            merged_labels.append(labels[j])
            skip_next = True
        else:
            merged_tokens.append(tok)
            merged_labels.append(labels[j])
    detect_ner.append(merged_tokens)
    detect_type.append(merged_labels)

In [216]:
# Store the re-merged predicted tokens and labels back in the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [217]:
# Recompute the positions of rows whose gold and predicted label sequences
# differ in length.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [218]:
# Rows still mismatched in length after the previous recomputation.
len(unequal)

1276

In [219]:
# Re-attach a split-off "." to the predicted token before it
# (e.g. "U.S", "." -> "U.S."). Only a "." that is not the last token of the
# row is merged, so a sentence-final period stays a token of its own. The
# merged token keeps the label of the token before the period.
# Results are collected per row in `detect_ner` / `detect_type`.
detect_ner = []
detect_type = []

for tokens, labels in zip(df["Detected_ners"], df["Detected_ner_types"]):
    merged_tokens = []
    merged_labels = []
    skip_next = False
    merge_limit = len(tokens) - 2
    for j, tok in enumerate(tokens):
        if skip_next:
            # This "." was already appended to the previous token. Checking
            # this first (the original checked it second) prevents a run of
            # periods from consuming the same "." twice.
            skip_next = False
            continue
        if j < merge_limit and tokens[j + 1] == ".":
            merged_tokens.append(tok + ".")
            merged_labels.append(labels[j])
            skip_next = True
        else:
            merged_tokens.append(tok)
            merged_labels.append(labels[j])
    detect_ner.append(merged_tokens)
    detect_type.append(merged_labels)

In [220]:
# Store the re-merged predicted tokens and labels back in the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [221]:
# Recompute the positions of rows whose gold and predicted label sequences
# differ in length.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [222]:
# Rows still mismatched in length after the previous recomputation.
len(unequal)

1273

In [223]:
# Re-join every standalone apostrophe in the gold tokens with the token that
# follows it, so the gold tokenisation matches the merged predictions.
# Results are collected per row in `actual_ner` / `actual_type`.
# NOTE(review): the merged token takes the gold label of the FOLLOWING token
# (j + 1), whereas the predicted-side merge in this notebook keeps the
# apostrophe's own label (j) — confirm this asymmetry is intended.
actual_ner = []
actual_type = []

for tokens, labels in zip(df["Actual_ners"], df["Actual_ner_types"]):
    merged_tokens = []
    merged_labels = []
    skip_next = False
    last = len(tokens) - 1
    for j, tok in enumerate(tokens):
        if skip_next:
            # This token was already glued onto the preceding apostrophe.
            # Checking this first (the original checked it second) prevents a
            # second consecutive apostrophe from being consumed twice.
            skip_next = False
            continue
        if tok == "'" and j < last:
            merged_tokens.append("'" + tokens[j + 1])
            merged_labels.append(labels[j + 1])
            skip_next = True
        else:
            merged_tokens.append(tok)
            merged_labels.append(labels[j])
    actual_ner.append(merged_tokens)
    actual_type.append(merged_labels)

In [224]:
# Store the re-merged gold tokens and labels back in the frame.
df["Actual_ners"] = actual_ner
df["Actual_ner_types"] = actual_type

In [225]:
# Recompute the positions of rows whose gold and predicted label sequences
# differ in length.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [226]:
# Rows still mismatched in length after the previous recomputation.
len(unequal)

1267

In [227]:
# Re-attach a split-off "." to the gold token before it, mirroring the same
# merge applied to the predicted tokens. Only a "." that is not the last token
# of the row is merged, so a sentence-final period stays a token of its own.
# The merged token keeps the gold label of the token before the period.
# Results are collected per row in `actual_ner` / `actual_type`.
actual_ner = []
actual_type = []

for tokens, labels in zip(df["Actual_ners"], df["Actual_ner_types"]):
    merged_tokens = []
    merged_labels = []
    skip_next = False
    merge_limit = len(tokens) - 2
    for j, tok in enumerate(tokens):
        if skip_next:
            # This "." was already appended to the previous token. Checking
            # this first (the original checked it second) prevents a run of
            # periods from consuming the same "." twice.
            skip_next = False
            continue
        if j < merge_limit and tokens[j + 1] == ".":
            merged_tokens.append(tok + ".")
            merged_labels.append(labels[j])
            skip_next = True
        else:
            merged_tokens.append(tok)
            merged_labels.append(labels[j])
    actual_ner.append(merged_tokens)
    actual_type.append(merged_labels)

In [228]:
# Store the re-merged gold tokens and labels back in the frame.
df["Actual_ners"] = actual_ner
df["Actual_ner_types"] = actual_type

In [229]:
# Recompute the positions of rows whose gold and predicted label sequences
# differ in length.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [230]:
# Rows still mismatched in length after the previous recomputation.
len(unequal)

1266

In [231]:
# Print gold vs. predicted tokens and labels for the first ten mismatched rows.
for pos in unequal[:10]:
    row = df.iloc[pos]
    for heading, tokens_col, labels_col in (
        ("actual", "Actual_ners", "Actual_ner_types"),
        ("detected", "Detected_ners", "Detected_ner_types"),
    ):
        print(heading)
        print(row[tokens_col])
        print(row[labels_col])
    print("-----------------------------------------------------------")

actual
['Dealing', 'with', 'serial', 'crimes', 'per', 'se', '/.']
['O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['Dealing', 'with', 'serial', 'crimes', 'per', 'se', '/', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['We', 'can', 'see', 'the', 'emerging', 'behavior', '/.']
['O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['We', 'can', 'see', 'the', 'emerging', 'behavior', '/', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['we', 'can', 'see', 'that', 'this', 'is', 'a', 'behavior', 'that', 'is', 'not', 'going', 'to', 'stop', '/.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['we', 'can', 'see', 'that', 'this', 'is', 'a', 'behavior', 'that', 'is', 'not', 'going', 'to', 'stop', '/', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-------------------------------------------------------

In [232]:
# For mismatched rows whose gold sentence ends in the single token "/.", drop
# the last predicted token and label (the predictions split it into "/" and
# ".", giving one token too many).
# Written with .at because assigning through `df.iloc[i,:].<col>` targets a
# temporary row Series and is not guaranteed to modify `df`.
# Not idempotent: re-running this cell without recomputing `unequal` trims
# another token from the same rows.
for i in unequal:
    row_label = df.index[i]
    gold_tokens = df.at[row_label, "Actual_ners"]
    # An empty gold sentence cannot end in "/."; skip it instead of raising.
    if gold_tokens and gold_tokens[-1] == "/.":
        df.at[row_label, "Detected_ners"] = df.at[row_label, "Detected_ners"][:-1]
        df.at[row_label, "Detected_ner_types"] = df.at[row_label, "Detected_ner_types"][:-1]

In [233]:
# Recompute the positions of rows whose gold and predicted label sequences
# differ in length.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [234]:
# Rows still mismatched in length after the previous recomputation.
len(unequal)

117

In [235]:
# Print gold vs. predicted tokens and labels for the first ten mismatched rows.
for pos in unequal[:10]:
    row = df.iloc[pos]
    for heading, tokens_col, labels_col in (
        ("actual", "Actual_ners", "Actual_ner_types"),
        ("detected", "Detected_ners", "Detected_ner_types"),
    ):
        print(heading)
        print(row[tokens_col])
        print(row[labels_col])
    print("-----------------------------------------------------------")

actual
['Can', 'this', 'man', 'be', 'stopped', '/?']
['O', 'O', 'O', 'O', 'O', 'O']
detected
['Can', 'this', 'man', 'be', 'stopped', '/', '?']
['O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['What', 'is', 'the', 'Dodo', 'administration', 'going', 'to', 'do', 'about', 'it', '/?']
['O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['What', 'is', 'the', 'Dodo', 'administration', 'going', 'to', 'do', 'about', 'it', '/', '?']
['O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['Will', 'the', 'United', 'States', 'insist', 'that', 'the', 'United', 'Nations', 'impose', 'sanctions', 'against', 'North', 'Korea', '/?']
['O', 'B-GPE', 'I-GPE', 'I-GPE', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'B-GPE', 'I-GPE', 'O']
detected
['Will', 'the', 'United', 'States', 'insist', 'that', 'the', 'United', 'Nations', 'impose', 'sanctions', 'aga

In [236]:
# For mismatched rows whose gold sentence ends in the single token "/?", drop
# the last predicted token and label (the predictions split it into "/" and
# "?", giving one token too many).
# Written with .at because assigning through `df.iloc[i,:].<col>` targets a
# temporary row Series and is not guaranteed to modify `df`.
# Not idempotent: re-running this cell without recomputing `unequal` trims
# another token from the same rows.
for i in unequal:
    row_label = df.index[i]
    gold_tokens = df.at[row_label, "Actual_ners"]
    # An empty gold sentence cannot end in "/?"; skip it instead of raising.
    if gold_tokens and gold_tokens[-1] == "/?":
        df.at[row_label, "Detected_ners"] = df.at[row_label, "Detected_ners"][:-1]
        df.at[row_label, "Detected_ner_types"] = df.at[row_label, "Detected_ner_types"][:-1]

In [237]:
# Recompute the positions of rows whose gold and predicted label sequences
# differ in length.
unequal = [
    pos
    for pos, (gold, pred) in enumerate(zip(df["Actual_ner_types"], df["Detected_ner_types"]))
    if len(gold) != len(pred)
]

In [238]:
# Rows still mismatched in length after the previous recomputation.
len(unequal)

18

In [239]:
# Print gold vs. predicted tokens and labels for the first ten mismatched rows.
for pos in unequal[:10]:
    row = df.iloc[pos]
    for heading, tokens_col, labels_col in (
        ("actual", "Actual_ners", "Actual_ner_types"),
        ("detected", "Detected_ners", "Detected_ner_types"),
    ):
        print(heading)
        print(row[tokens_col])
        print(row[labels_col])
    print("-----------------------------------------------------------")

actual
['The', 'group', "'s", 'assessment', 'based', 'on', 'analysis', 'of', 'satellite', 'imagery', 'media', 'reports', 'and', 'statements', 'by', 'North', 'Korean', 'officials', 'which', 'lead', 'Dodo', 'Dodo', 'a', 'conservative', 'critic', 'from', 'the', 'American', 'Enterprise', 'Institute', 'these', 'are', 'not', 'Democrats', '/-']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'I-NORP', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'B-NORP', 'O']
detected
['The', 'group', "'s", 'assessment', 'based', 'on', 'analysis', 'of', 'satellite', 'imagery', 'media', 'reports', 'and', 'statements', 'by', 'North', 'Korean', 'officials', 'which', 'lead', 'Dodo', 'Dodo', 'a', 'conservative', 'critic', 'from', 'the', 'American', 'Enterprise', 'Institute', 'these', 'are', 'not', 'Democrats', '/', '-']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'I-NORP

In [240]:
# Trim the trailing detected token/tag on hand-picked misaligned rows.
# Positions 2, 4, 6 and 8 of `unequal` are left out on purpose (the original "#2,4,6,8" note).
# Cells are written through .iat: `df.iloc[i, :].col = ...` assigns to a temporary row
# Series and only reaches `df` when pandas happens to return a view of the row.
tokens_col = df.columns.get_loc("Detected_ners")
tags_col = df.columns.get_loc("Detected_ner_types")
selected_rows = [unequal[k] for k in (0, 1, 3, 5, 7, 9)]
for i in selected_rows:
    df.iat[i, tokens_col] = df.iat[i, tokens_col][:-1]
    df.iat[i, tags_col] = df.iat[i, tags_col][:-1]

In [241]:
# Print gold vs. detected tokens and tags for the misaligned rows after the first ten.
for row_pos in unequal[10:]:
    row = df.iloc[row_pos]
    print("actual", row.Actual_ners, row.Actual_ner_types, sep="\n")
    print("detected", row.Detected_ners, row.Detected_ner_types, sep="\n")
    print("-----------------------------------------------------------")

actual
['And', 'on', 'that', 'point', 'Dodo', 'on', 'that', 'point', 'she', 'says', 'sh-', '/-']
['O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['And', 'on', 'that', 'point', 'Dodo', 'on', 'that', 'point', 'she', 'says', 'sh', '-', '/', '-']
['O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['But', 'it', 'uh', 'is', 'very', 'very', 'you', 'know', 'eh', 'irre-', '/-']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['But', 'it', 'uh', 'is', 'very', 'very', 'you', 'know', 'eh', 'irre', '-', '/', '-']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['Sir', 'in', 'north', 'central', 'Iraq', 'voter', 'regis-', '/-']
['O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O']
detected
['Sir', 'in', 'north', 'central', 'Iraq', 'voter', 'regis', '-', '/', '-']
['O', 'O', 'O', 'O', 'B-G

In [242]:
# Trim the trailing detected token/tag on every misaligned row after the first ten.
# Cells are written through .iat: `df.iloc[i, :].col = ...` assigns to a temporary row
# Series and only reaches `df` when pandas happens to return a view of the row.
tokens_col = df.columns.get_loc("Detected_ners")
tags_col = df.columns.get_loc("Detected_ner_types")
for i in unequal[10:]:
    df.iat[i, tokens_col] = df.iat[i, tokens_col][:-1]
    df.iat[i, tags_col] = df.iat[i, tags_col][:-1]

In [243]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [244]:
len(unequal)

8

In [245]:
# Print gold vs. detected tokens and tags for every remaining misaligned row.
for row_pos in unequal:
    row = df.iloc[row_pos]
    print("actual", row.Actual_ners, row.Actual_ner_types, sep="\n")
    print("detected", row.Detected_ners, row.Detected_ner_types, sep="\n")
    print("-----------------------------------------------------------")

actual
['but', 'I', 'think', 'it', "'s", 'essential', 'to', 'have', 'face', 'to', '*face', 'private', 'bilateral', 'talks', 'between', 'United', 'States', 'and', 'North', 'Korea', '/.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'I-GPE', 'O', 'B-GPE', 'I-GPE', 'O']
detected
['but', 'I', 'think', 'it', "'s", 'essential', 'to', 'have', 'face', 'to', '*', 'face', 'private', 'bilateral', 'talks', 'between', 'United', 'States', 'and', 'North', 'Korea', '/']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'I-GPE', 'O', 'B-GPE', 'I-GPE', 'O']
-----------------------------------------------------------
actual
['Among', 'the', 'journalistic', 'revelations,', 'Dodo', 'released', 'from', 'jail', 'last', 'month', 'says', 'she', 'pushed', 'for', 'a', 'story', 'on', 'the', 'outing', 'of', 'Dodo', 'Dodo', 'back', 'in', 'two', 'thousand', 'three', '/.']
['O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'B-DATE', 'I-DATE'

In [246]:
# Misaligned rows fall back to the gold annotation; the second entry of `unequal`
# is skipped and keeps its detected values.
# NOTE(review): copying gold tokens/tags into the Detected_* columns makes these rows
# compare as identical to the gold columns afterwards - confirm that is intended.
# Cells are written through .iat: `df.iloc[i, :].col = ...` assigns to a temporary row
# Series and only reaches `df` when pandas happens to return a view of the row.
column_pairs = [
    (df.columns.get_loc("Detected_ners"), df.columns.get_loc("Actual_ners")),
    (df.columns.get_loc("Detected_ner_types"), df.columns.get_loc("Actual_ner_types")),
]
for i in unequal:
    if i == unequal[1]:
        continue
    for detected_col, gold_col in column_pairs:
        # Store a copy so the detected and gold cells do not share one list object.
        df.iat[i, detected_col] = list(df.iat[i, gold_col])

In [247]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [248]:
len(unequal)

1

In [249]:
# Print gold vs. detected tokens and tags for every remaining misaligned row.
for row_pos in unequal:
    row = df.iloc[row_pos]
    print("actual", row.Actual_ners, row.Actual_ner_types, sep="\n")
    print("detected", row.Detected_ners, row.Detected_ner_types, sep="\n")
    print("-----------------------------------------------------------")

actual
['Among', 'the', 'journalistic', 'revelations,', 'Dodo', 'released', 'from', 'jail', 'last', 'month', 'says', 'she', 'pushed', 'for', 'a', 'story', 'on', 'the', 'outing', 'of', 'Dodo', 'Dodo', 'back', 'in', 'two', 'thousand', 'three', '/.']
['O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O']
detected
['Among', 'the', 'journalistic', 'revelations', ',', 'Dodo', 'released', 'from', 'jail', 'last', 'month', 'says', 'she', 'pushed', 'for', 'a', 'story', 'on', 'the', 'outing', 'of', 'Dodo', 'Dodo', 'back', 'in', 'two', 'thousand', 'three', '/']
['O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'I-DATE']
-----------------------------------------------------------


In [250]:
# Hand-written replacement for the remaining misaligned row(s): every row listed in
# `unequal` receives these tokens and tags.
# Cells are written through .iat: `df.iloc[i, :].col = ...` assigns to a temporary row
# Series and only reaches `df` when pandas happens to return a view of the row.
patched_tokens = ['Among', 'the', 'journalistic', 'revelations,', 'Dodo', 'released', 'from', 'jail', 'last', 'month', 'says', 'she', 'pushed', 'for', 'a', 'story', 'on', 'the', 'outing', 'of', 'Dodo', 'Dodo', 'back', 'in', 'two', 'thousand', 'three', '/']
patched_tags = ['O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'I-DATE']
tokens_col = df.columns.get_loc("Detected_ners")
tags_col = df.columns.get_loc("Detected_ner_types")
for i in unequal:
    # A fresh list per row, as the original list literals inside the loop produced.
    df.iat[i, tokens_col] = list(patched_tokens)
    df.iat[i, tags_col] = list(patched_tags)

In [251]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [252]:
len(unequal)

0

In [253]:
# Report accuracy_score and f1_score of the detected tags against the gold tags
# for the file currently held in `f`.
gold_tag_lists = df.Actual_ner_types.tolist()
detected_tag_lists = df.Detected_ner_types.tolist()
print(f)
print("accuracy score")
print(accuracy_score(gold_tag_lists, detected_tag_lists))
print(f1_score(gold_tag_lists, detected_tag_lists))

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_bc_ner_perturb1_perturb1.csv
accuracy score
0.9783017994382542
0.8336842105263158


In [254]:
# Select the entry at index 5 of `flist` as the results file for the cells that follow.
f = flist[5]

In [255]:
# Load the results file and turn the stringified list columns back into Python lists.
df = pd.read_csv(f)
# Series.map is used per column because DataFrame.applymap is deprecated in newer pandas.
# NOTE(review): yaml.safe_load is parsing list text written by an earlier step;
# confirm every token survives the round trip unchanged.
for list_col in ("Detected_ners", "Detected_ner_types", "Actual_ners", "Actual_ner_types"):
    df[list_col] = df[list_col].map(yaml.safe_load)

# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [256]:
f

'/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_wb_ner_perturb1_perturb1.csv'

In [257]:
len(unequal)

244

In [258]:
# Glue each stand-alone "'" in the detected tokens onto the token that follows it
# (e.g. "'", "s" -> "'s") and drop that following token.
# The merged token keeps the tag of the apostrophe itself (tags[pos]).
# NOTE(review): the matching cell for the Actual_* columns takes the tag of the
# following token instead - confirm the difference is intended.
detect_ner = []
detect_type = []

for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    merged_tokens, merged_tags = [], []
    skip_next = False
    last_pos = len(tokens) - 1
    for pos, token in enumerate(tokens):
        # The apostrophe test runs before the skip test, exactly as before.
        if pos < last_pos and token == "'":
            merged_tokens.append("'" + tokens[pos + 1])
            merged_tags.append(tags[pos])
            skip_next = True
        elif skip_next:
            # This token was already absorbed into the previous merged token.
            skip_next = False
        else:
            merged_tokens.append(token)
            merged_tags.append(tags[pos])
    detect_ner.append(merged_tokens)
    detect_type.append(merged_tags)

In [259]:
# Write the rebuilt detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [260]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [261]:
len(unequal)

142

In [262]:
# Print gold vs. detected tokens and tags for the first ten misaligned rows.
for row_pos in unequal[:10]:
    row = df.iloc[row_pos]
    print("actual", row.Actual_ners, row.Actual_ner_types, sep="\n")
    print("detected", row.Detected_ners, row.Detected_ner_types, sep="\n")
    print("-----------------------------------------------------------")

actual
['The', 'Hebrew', 'channel', ',', 'I', 'meant', 'the', 'Al', '-', 'Arabiya', 'channel', ',', 'did', 'not', 'dare', ',', 'to', 'broadcast', 'this', 'film', 'or', 'even', 'mention', 'it', 'and', 'the', 'reasons', 'are', 'famously', 'unknown', '..']
['O', 'B-LANGUAGE', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['The', 'Hebrew', 'channel', ',', 'I', 'meant', 'the', 'Al', '-', 'Arabiya', 'channel', ',', 'did', 'not', 'dare', ',', 'to', 'broadcast', 'this', 'film', 'or', 'even', 'mention', 'it', 'and', 'the', 'reasons', 'are', 'famously', 'unknown', '.', '.']
['O', 'B-NORP', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['This', 'is', 'a', 'unique', 'precedent', 'for', 'al', '-', 'Ja

In [263]:
# Glue each stand-alone "'" in the gold tokens onto the token that follows it
# and drop that following token.
# The merged token takes the tag of the following token (tags[pos + 1]).
# NOTE(review): the matching cell for the Detected_* columns keeps the tag of the
# apostrophe itself - confirm the difference is intended.
actual_ner = []
actual_type = []

for tokens, tags in zip(df.Actual_ners, df.Actual_ner_types):
    merged_tokens, merged_tags = [], []
    skip_next = False
    last_pos = len(tokens) - 1
    for pos, token in enumerate(tokens):
        # The apostrophe test runs before the skip test, exactly as before.
        if pos < last_pos and token == "'":
            merged_tokens.append("'" + tokens[pos + 1])
            merged_tags.append(tags[pos + 1])
            skip_next = True
        elif skip_next:
            # This token was already absorbed into the previous merged token.
            skip_next = False
        else:
            merged_tokens.append(token)
            merged_tags.append(tags[pos])
    actual_ner.append(merged_tokens)
    actual_type.append(merged_tags)

In [264]:
# Write the rebuilt gold tokens and tags back into the frame.
df["Actual_ners"] = actual_ner
df["Actual_ner_types"] = actual_type

In [265]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [266]:
len(unequal)

124

In [267]:
# Attach a "." token to the detected token before it (e.g. "Mrs", "." -> "Mrs."),
# except within the last two positions of the sentence.
# NOTE(review): the look-ahead test runs before the skip test, so a "." that is itself
# followed by "." becomes ".." instead of being dropped - behaviour kept as is.
detect_ner = []
detect_type = []

for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    merged_tokens, merged_tags = [], []
    period_absorbed = False
    limit = len(tokens) - 2
    for pos, token in enumerate(tokens):
        if pos < limit and tokens[pos + 1] == ".":
            merged_tokens.append(token + ".")
            merged_tags.append(tags[pos])
            period_absorbed = True
        elif token == "." and period_absorbed:
            # This period is already part of the previous merged token.
            period_absorbed = False
        else:
            merged_tokens.append(token)
            merged_tags.append(tags[pos])
    detect_ner.append(merged_tokens)
    detect_type.append(merged_tags)

In [268]:
# Write the rebuilt detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [269]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [270]:
len(unequal)

105

In [271]:
# Attach a "." token to the gold token before it, except within the last two
# positions of the sentence, mirroring the treatment of the detected tokens.
# NOTE(review): the look-ahead test runs before the skip test, so a "." that is itself
# followed by "." becomes ".." instead of being dropped - behaviour kept as is.
actual_ner = []
actual_type = []

for tokens, tags in zip(df.Actual_ners, df.Actual_ner_types):
    merged_tokens, merged_tags = [], []
    period_absorbed = False
    limit = len(tokens) - 2
    for pos, token in enumerate(tokens):
        if pos < limit and tokens[pos + 1] == ".":
            merged_tokens.append(token + ".")
            merged_tags.append(tags[pos])
            period_absorbed = True
        elif token == "." and period_absorbed:
            # This period is already part of the previous merged token.
            period_absorbed = False
        else:
            merged_tokens.append(token)
            merged_tags.append(tags[pos])
    actual_ner.append(merged_tokens)
    actual_type.append(merged_tags)

In [272]:
# Write the rebuilt gold tokens and tags back into the frame.
df["Actual_ners"] = actual_ner
df["Actual_ner_types"] = actual_type

In [273]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [274]:
len(unequal)

97

In [275]:
# Print gold vs. detected tokens and tags for the first ten misaligned rows.
for row_pos in unequal[:10]:
    row = df.iloc[row_pos]
    print("actual", row.Actual_ners, row.Actual_ner_types, sep="\n")
    print("detected", row.Detected_ners, row.Detected_ner_types, sep="\n")
    print("-----------------------------------------------------------")

actual
['There', 'are', 'many', 'questions', ',', 'amongst', 'them', 'the', 'timing', 'of', 'its', 'being', 'shown', 'at', 'this', 'very', 'time', ',', 'following', 'the', 'conflict', 'in', 'interests', 'between', 'the', 'Sons', 'of', 'Monkeys', 'and', 'Pigs', 'and', 'the', 'Sons', 'of', 'Temporary', 'Marriages', '...']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['There', 'are', 'many', 'questions', ',', 'amongst', 'them', 'the', 'timing', 'of', 'its', 'being', 'shown', 'at', 'this', 'very', 'time', ',', 'following', 'the', 'conflict', 'in', 'interests', 'between', 'the', 'Sons', 'of', 'Monkeys', 'and', 'Pigs', 'and', 'the', 'Sons', 'of', 'Temporary', 'Marriages.', '..', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG

In [277]:
import re
def multicount_re(searchlist):
    """Return the positions of the strings in ``searchlist`` that start with two or more dots."""
    return [
        pos
        for pos, text in enumerate(searchlist)
        if re.search(r'^[.][.]+', text)
    ]

# For rows holding more than one dot-run token (as reported by multicount_re), replace
# each consecutive run of such tokens with a single "..." tagged "O". The final token
# of the sentence is always copied through unchanged. Other rows are kept as they are.
detect_ner = []
detect_type = []

for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    dot_positions = multicount_re(tokens)
    if len(dot_positions) <= 1:
        detect_ner.append(tokens)
        detect_type.append(tags)
        continue

    collapsed_tokens, collapsed_tags = [], []
    last_pos = len(tokens) - 1
    pos = 0
    while pos < last_pos:
        if pos in dot_positions:
            # Swallow the whole run (never the final position) and emit one ellipsis.
            while pos < last_pos and pos in dot_positions:
                pos += 1
            collapsed_tokens.append("...")
            collapsed_tags.append("O")
        else:
            collapsed_tokens.append(tokens[pos])
            collapsed_tags.append(tags[pos])
            pos += 1
    collapsed_tokens.append(tokens[last_pos])
    collapsed_tags.append(tags[last_pos])
    detect_ner.append(collapsed_tokens)
    detect_type.append(collapsed_tags)

In [278]:
# Write the rebuilt detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [279]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [280]:
len(unequal)

85

In [281]:
# Print gold vs. detected tokens and tags for the first ten misaligned rows.
for row_pos in unequal[:10]:
    row = df.iloc[row_pos]
    print("actual", row.Actual_ners, row.Actual_ner_types, sep="\n")
    print("detected", row.Detected_ners, row.Detected_ner_types, sep="\n")
    print("-----------------------------------------------------------")

actual
['There', 'are', 'many', 'questions', ',', 'amongst', 'them', 'the', 'timing', 'of', 'its', 'being', 'shown', 'at', 'this', 'very', 'time', ',', 'following', 'the', 'conflict', 'in', 'interests', 'between', 'the', 'Sons', 'of', 'Monkeys', 'and', 'Pigs', 'and', 'the', 'Sons', 'of', 'Temporary', 'Marriages', '...']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['There', 'are', 'many', 'questions', ',', 'amongst', 'them', 'the', 'timing', 'of', 'its', 'being', 'shown', 'at', 'this', 'very', 'time', ',', 'following', 'the', 'conflict', 'in', 'interests', 'between', 'the', 'Sons', 'of', 'Monkeys', 'and', 'Pigs', 'and', 'the', 'Sons', 'of', 'Temporary', 'Marriages.', '..', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG

In [282]:
# Rejoin bracket codes that were split into "-", CODE, "-" (e.g. "-LRB-") in the
# detected tokens. The merged token takes the tag of the CODE token, and the two
# tokens after the leading "-" are dropped.
detect_ner = []
detect_type = []

bracket_codes = ["LRB","RRB","LCB","RCB","LSB","RSB"]
for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    merged_tokens, merged_tags = [], []
    tokens_to_skip = 0
    limit = len(tokens) - 2
    for pos, token in enumerate(tokens):
        if pos < limit and token == "-" and tokens[pos + 1] in bracket_codes:
            merged_tokens.append("-" + tokens[pos + 1] + "-")
            merged_tags.append(tags[pos + 1])
            tokens_to_skip = 2
        elif tokens_to_skip:
            # CODE and the closing "-" are already part of the merged token.
            tokens_to_skip -= 1
        else:
            merged_tokens.append(token)
            merged_tags.append(tags[pos])
    detect_ner.append(merged_tokens)
    detect_type.append(merged_tags)

In [283]:
# Write the rebuilt detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [284]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [285]:
len(unequal)

41

In [286]:
# Print gold vs. detected tokens and tags for the first ten misaligned rows.
for row_pos in unequal[:10]:
    row = df.iloc[row_pos]
    print("actual", row.Actual_ners, row.Actual_ner_types, sep="\n")
    print("detected", row.Detected_ners, row.Detected_ner_types, sep="\n")
    print("-----------------------------------------------------------")

actual
['There', 'are', 'many', 'questions', ',', 'amongst', 'them', 'the', 'timing', 'of', 'its', 'being', 'shown', 'at', 'this', 'very', 'time', ',', 'following', 'the', 'conflict', 'in', 'interests', 'between', 'the', 'Sons', 'of', 'Monkeys', 'and', 'Pigs', 'and', 'the', 'Sons', 'of', 'Temporary', 'Marriages', '...']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['There', 'are', 'many', 'questions', ',', 'amongst', 'them', 'the', 'timing', 'of', 'its', 'being', 'shown', 'at', 'this', 'very', 'time', ',', 'following', 'the', 'conflict', 'in', 'interests', 'between', 'the', 'Sons', 'of', 'Monkeys', 'and', 'Pigs', 'and', 'the', 'Sons', 'of', 'Temporary', 'Marriages.', '..', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG

In [298]:
re.findall(r'^.[.]?.$','..Hfda')

[]

In [299]:
# Drop the last detected token/tag on misaligned rows whose second-to-last detected
# token matches the pattern below.
# NOTE(review): the bare "." in the pattern matches any character, so every
# two-character token qualifies, not only runs of dots - confirm that is intended.
# Cells are written through .iat: `df.iloc[i, :].col = ...` assigns to a temporary row
# Series and only reaches `df` when pandas happens to return a view of the row.
tokens_col = df.columns.get_loc("Detected_ners")
tags_col = df.columns.get_loc("Detected_ner_types")
for i in unequal:
    detected_tokens = df.iat[i, tokens_col]
    # Rows with fewer than two tokens have no second-to-last token to test.
    if len(detected_tokens) >= 2 and re.search(r'^.[.]?.$', detected_tokens[-2]):
        df.iat[i, tokens_col] = detected_tokens[:-1]
        df.iat[i, tags_col] = df.iat[i, tags_col][:-1]

In [300]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [301]:
len(unequal)

8

In [302]:
# Print gold vs. detected tokens and tags for every remaining misaligned row.
for row_pos in unequal:
    row = df.iloc[row_pos]
    print("actual", row.Actual_ners, row.Actual_ner_types, sep="\n")
    print("detected", row.Detected_ners, row.Detected_ner_types, sep="\n")
    print("-----------------------------------------------------------")

actual
['http://z08.zupload.com/download.php?...filepath=48993']
['O']
detected
['http://z08.zupload.com/download.php', '?..', '..']
['O', 'O', 'O']
-----------------------------------------------------------
actual
['It', "'s", 'the', 'most', 'disappointingly', '*cheesy*', '1960s', 'sci', '-', 'fi', 'kind', 'of', 'name', 'for', 'an', 'astral', 'entity', 'to', 'use', ',', 'that', "'s", 'for', 'sure', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['It', "'s", 'the', 'most', 'disappointingly', '*', 'cheesy', '*', '1960s', 'sci', '-', 'fi', 'kind', 'of', 'name', 'for', 'an', 'astral', 'entity', 'to', 'use', ',', 'that', "'s", 'for', 'sure', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['The', 'way', 'I', 'work', 'it', "'s", 'g

In [303]:
# Misaligned rows fall back to the gold annotation; the entry at index 6 of `unequal`
# is skipped and keeps its detected values.
# NOTE(review): copying gold tokens/tags into the Detected_* columns makes these rows
# compare as identical to the gold columns afterwards - confirm that is intended.
# Cells are written through .iat: `df.iloc[i, :].col = ...` assigns to a temporary row
# Series and only reaches `df` when pandas happens to return a view of the row.
column_pairs = [
    (df.columns.get_loc("Detected_ners"), df.columns.get_loc("Actual_ners")),
    (df.columns.get_loc("Detected_ner_types"), df.columns.get_loc("Actual_ner_types")),
]
for i in unequal:
    if i == unequal[6]:
        continue
    for detected_col, gold_col in column_pairs:
        # Store a copy so the detected and gold cells do not share one list object.
        df.iat[i, detected_col] = list(df.iat[i, gold_col])

In [304]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [305]:
len(unequal)

1

In [306]:
# Print gold vs. detected tokens and tags for every remaining misaligned row.
for row_pos in unequal:
    row = df.iloc[row_pos]
    print("actual", row.Actual_ners, row.Actual_ner_types, sep="\n")
    print("detected", row.Detected_ners, row.Detected_ner_types, sep="\n")
    print("-----------------------------------------------------------")

actual
['MSN', ':', 'p...@hotmail.com']
['O', 'O', 'O']
detected
['MSN', ':', 'p', '...']
['B-ORG', 'O', 'O', 'O']
-----------------------------------------------------------


In [307]:
# Hand-written replacement for the first misaligned row.
# Cells are written through .iat: `df.iloc[i, :].col = ...` assigns to a temporary row
# Series and only reaches `df` when pandas happens to return a view of the row.
i = unequal[0]
df.iat[i, df.columns.get_loc("Detected_ners")] = ['MSN', ':', 'p']
df.iat[i, df.columns.get_loc("Detected_ner_types")] = ['B-ORG', 'O', 'O']

In [308]:
# Report accuracy_score and f1_score of the detected tags against the gold tags
# for the file currently held in `f`.
gold_tag_lists = df.Actual_ner_types.tolist()
detected_tag_lists = df.Detected_ner_types.tolist()
print(f)
print("accuracy score")
print(accuracy_score(gold_tag_lists, detected_tag_lists))
print(f1_score(gold_tag_lists, detected_tag_lists))

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_wb_ner_perturb1_perturb1.csv
accuracy score
0.9589107056872291
0.7434362045140488


In [310]:
# Select the entry at index 6 of `flist` as the results file for the cells that follow.
f = flist[6]

In [311]:
f

'/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_nw_ner_perturb1_perturb1.csv'

In [312]:
# Load the results file and turn the stringified list columns back into Python lists.
df = pd.read_csv(f)
# Series.map is used per column because DataFrame.applymap is deprecated in newer pandas.
# NOTE(review): yaml.safe_load is parsing list text written by an earlier step;
# confirm every token survives the round trip unchanged.
for list_col in ("Detected_ners", "Detected_ner_types", "Actual_ners", "Actual_ner_types"):
    df[list_col] = df[list_col].map(yaml.safe_load)

# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [313]:
len(unequal)

740

In [314]:
# Glue each stand-alone "'" in the detected tokens onto the token that follows it
# (e.g. "'", "s" -> "'s") and drop that following token.
# The merged token keeps the tag of the apostrophe itself (tags[pos]).
# NOTE(review): the matching cell for the Actual_* columns takes the tag of the
# following token instead - confirm the difference is intended.
detect_ner = []
detect_type = []

for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    merged_tokens, merged_tags = [], []
    skip_next = False
    last_pos = len(tokens) - 1
    for pos, token in enumerate(tokens):
        # The apostrophe test runs before the skip test, exactly as before.
        if pos < last_pos and token == "'":
            merged_tokens.append("'" + tokens[pos + 1])
            merged_tags.append(tags[pos])
            skip_next = True
        elif skip_next:
            # This token was already absorbed into the previous merged token.
            skip_next = False
        else:
            merged_tokens.append(token)
            merged_tags.append(tags[pos])
    detect_ner.append(merged_tokens)
    detect_type.append(merged_tags)

In [315]:
# Write the rebuilt detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [316]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [317]:
len(unequal)

449

In [318]:
# Attach a "." token to the detected token before it (e.g. "Mrs", "." -> "Mrs."),
# except within the last two positions of the sentence.
# NOTE(review): the look-ahead test runs before the skip test, so a "." that is itself
# followed by "." becomes ".." instead of being dropped - behaviour kept as is.
detect_ner = []
detect_type = []

for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    merged_tokens, merged_tags = [], []
    period_absorbed = False
    limit = len(tokens) - 2
    for pos, token in enumerate(tokens):
        if pos < limit and tokens[pos + 1] == ".":
            merged_tokens.append(token + ".")
            merged_tags.append(tags[pos])
            period_absorbed = True
        elif token == "." and period_absorbed:
            # This period is already part of the previous merged token.
            period_absorbed = False
        else:
            merged_tokens.append(token)
            merged_tags.append(tags[pos])
    detect_ner.append(merged_tokens)
    detect_type.append(merged_tags)

In [319]:
# Write the rebuilt detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [320]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [321]:
len(unequal)

134

In [323]:
# Print gold vs. detected tokens and tags for the first five misaligned rows.
for row_pos in unequal[:5]:
    row = df.iloc[row_pos]
    print("actual", row.Actual_ners, row.Actual_ner_types, sep="\n")
    print("detected", row.Detected_ners, row.Detected_ner_types, sep="\n")
    print("-----------------------------------------------------------")

actual
['Mrs.', 'Dodo', "'", 'first', 'trip', 'to', 'Japan', 'as', 'America', "'s", 'chief', 'trade', 'negotiator', 'had', 'a', 'completely', 'different', 'tone', 'from', 'last', 'month', "'s", 'visit', 'by', 'Commerce', 'Secretary', 'Dodo', 'Dodo', 'Dodo', '.']
['O', 'B-PERSON', 'O', 'B-ORDINAL', 'O', 'O', 'B-GPE', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'B-ORG', 'O', 'B-PERSON', 'I-PERSON', 'I-PERSON', 'O']
detected
['Mrs.', 'Dodo', "'first", 'trip', 'to', 'Japan', 'as', 'America', "'s", 'chief', 'trade', 'negotiator', 'had', 'a', 'completely', 'different', 'tone', 'from', 'last', 'month', "'s", 'visit', 'by', 'Commerce', 'Secretary', 'Dodo', 'Dodo', 'Dodo', '.']
['O', 'B-PERSON', 'O', 'O', 'O', 'B-GPE', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'B-ORG', 'O', 'B-PERSON', 'I-PERSON', 'I-PERSON', 'O']
-----------------------------------------------------------
a

In [324]:
# Rejoin bracket codes that were split into "-", CODE, "-" (e.g. "-LRB-") in the
# detected tokens. The merged token takes the tag of the CODE token, and the two
# tokens after the leading "-" are dropped.
detect_ner = []
detect_type = []

bracket_codes = ["LRB","RRB","LCB","RCB","LSB","RSB"]
for tokens, tags in zip(df.Detected_ners, df.Detected_ner_types):
    merged_tokens, merged_tags = [], []
    tokens_to_skip = 0
    limit = len(tokens) - 2
    for pos, token in enumerate(tokens):
        if pos < limit and token == "-" and tokens[pos + 1] in bracket_codes:
            merged_tokens.append("-" + tokens[pos + 1] + "-")
            merged_tags.append(tags[pos + 1])
            tokens_to_skip = 2
        elif tokens_to_skip:
            # CODE and the closing "-" are already part of the merged token.
            tokens_to_skip -= 1
        else:
            merged_tokens.append(token)
            merged_tags.append(tags[pos])
    detect_ner.append(merged_tokens)
    detect_type.append(merged_tags)

In [325]:
# Write the rebuilt detected tokens and tags back into the frame.
df["Detected_ners"] = detect_ner
df["Detected_ner_types"] = detect_type

In [326]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [327]:
len(unequal)

56

In [328]:
# Glue each stand-alone "'" in the gold tokens onto the token that follows it
# and drop that following token.
# The merged token takes the tag of the following token (tags[pos + 1]).
# NOTE(review): the matching cell for the Detected_* columns keeps the tag of the
# apostrophe itself - confirm the difference is intended.
actual_ner = []
actual_type = []

for tokens, tags in zip(df.Actual_ners, df.Actual_ner_types):
    merged_tokens, merged_tags = [], []
    skip_next = False
    last_pos = len(tokens) - 1
    for pos, token in enumerate(tokens):
        # The apostrophe test runs before the skip test, exactly as before.
        if pos < last_pos and token == "'":
            merged_tokens.append("'" + tokens[pos + 1])
            merged_tags.append(tags[pos + 1])
            skip_next = True
        elif skip_next:
            # This token was already absorbed into the previous merged token.
            skip_next = False
        else:
            merged_tokens.append(token)
            merged_tags.append(tags[pos])
    actual_ner.append(merged_tokens)
    actual_type.append(merged_tags)

In [329]:
# Write the rebuilt gold tokens and tags back into the frame.
df["Actual_ners"] = actual_ner
df["Actual_ner_types"] = actual_type

In [330]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [331]:
len(unequal)

27

In [332]:
# Attach a "." token to the gold token before it, except within the last two
# positions of the sentence, mirroring the treatment of the detected tokens.
# NOTE(review): the look-ahead test runs before the skip test, so a "." that is itself
# followed by "." becomes ".." instead of being dropped - behaviour kept as is.
actual_ner = []
actual_type = []

for tokens, tags in zip(df.Actual_ners, df.Actual_ner_types):
    merged_tokens, merged_tags = [], []
    period_absorbed = False
    limit = len(tokens) - 2
    for pos, token in enumerate(tokens):
        if pos < limit and tokens[pos + 1] == ".":
            merged_tokens.append(token + ".")
            merged_tags.append(tags[pos])
            period_absorbed = True
        elif token == "." and period_absorbed:
            # This period is already part of the previous merged token.
            period_absorbed = False
        else:
            merged_tokens.append(token)
            merged_tags.append(tags[pos])
    actual_ner.append(merged_tokens)
    actual_type.append(merged_tags)

In [333]:
# Write the rebuilt gold tokens and tags back into the frame.
df["Actual_ners"] = actual_ner
df["Actual_ner_types"] = actual_type

In [334]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [335]:
len(unequal)

8

In [336]:
# Print gold vs. detected tokens and tags for every remaining misaligned row.
for row_pos in unequal:
    row = df.iloc[row_pos]
    print("actual", row.Actual_ners, row.Actual_ner_types, sep="\n")
    print("detected", row.Detected_ners, row.Detected_ner_types, sep="\n")
    print("-----------------------------------------------------------")

actual
['She', 'added', 'that', 'she', 'expected', '``', 'perhaps', 'to', 'have', 'a', 'down', 'payment', '...', 'some', 'small', 'step', 'to', 'convince', 'the', 'American', 'people', 'and', 'the', 'Japanese', 'people', 'that', 'we', "'re", 'moving', 'in', 'earnest', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
detected
['She', 'added', 'that', 'she', 'expected', '``', 'perhaps', 'to', 'have', 'a', 'down', 'payment.', '..', '..', 'some', 'small', 'step', 'to', 'convince', 'the', 'American', 'people', 'and', 'the', 'Japanese', 'people', 'that', 'we', "'re", 'moving', 'in', 'earnest', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-----------------------------------------------------------
actual
['My', 'colleagues', 'and', 'I

In [338]:
# Every misaligned row except the last one in `unequal` falls back to the gold annotation.
# NOTE(review): copying gold tokens/tags into the Detected_* columns makes these rows
# compare as identical to the gold columns afterwards - confirm that is intended.
# Cells are written through .iat: `df.iloc[i, :].col = ...` assigns to a temporary row
# Series and only reaches `df` when pandas happens to return a view of the row.
column_pairs = [
    (df.columns.get_loc("Detected_ners"), df.columns.get_loc("Actual_ners")),
    (df.columns.get_loc("Detected_ner_types"), df.columns.get_loc("Actual_ner_types")),
]
for i in unequal[:-1]:
    for detected_col, gold_col in column_pairs:
        # Store a copy so the detected and gold cells do not share one list object.
        df.iat[i, detected_col] = list(df.iat[i, gold_col])

In [339]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [340]:
len(unequal)

1

In [341]:
# Print gold vs. detected tokens and tags for every remaining misaligned row.
for row_pos in unequal:
    row = df.iloc[row_pos]
    print("actual", row.Actual_ners, row.Actual_ner_types, sep="\n")
    print("detected", row.Detected_ners, row.Detected_ner_types, sep="\n")
    print("-----------------------------------------------------------")

actual
['A', 'month', 'ago', ',', 'Hertz', ',', 'of', 'Park', 'Ridge', ',', 'N.J.', ',', 'said', 'that', 'it', 'would', 'drop', 'its', 'marketing', 'agreements', 'at', 'year', 'end', 'with', 'Delta', ',', 'America', 'West', 'and', 'Texas', 'Air', 'Corp.', "'s", 'Continental', 'Airlines', 'and', 'Eastern', 'Airlines', ',', 'and', 'that', 'pacts', 'with', 'American', 'Airlines', ',', 'UAL', 'Inc', "'s", 'United', 'Airlines', 'and', 'USAir', 'also', 'would', 'be', 'ended', '...', 'sometime', 'after', 'Dec.', '31', '.']
['B-DATE', 'I-DATE', 'I-DATE', 'O', 'B-ORG', 'O', 'O', 'B-GPE', 'I-GPE', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O', 'B-ORG', 'O', 'B-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'B-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'B-ORG', 'I-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O']
detected
['A', 'month', 'ago', ',',

In [342]:
# Hand-written replacement for the remaining misaligned row(s): every row listed in
# `unequal` receives these tokens and tags.
# Cells are written through .iat: `df.iloc[i, :].col = ...` assigns to a temporary row
# Series and only reaches `df` when pandas happens to return a view of the row.
patched_tokens = ['A', 'month', 'ago', ',', 'Hertz', ',', 'of', 'Park', 'Ridge', ',', 'N.J.', ',', 'said', 'that', 'it', 'would', 'drop', 'its', 'marketing', 'agreements', 'at', 'year', 'end', 'with', 'Delta', ',', 'America', 'West', 'and', 'Texas', 'Air', 'Corp.', "'s", 'Continental', 'Airlines', 'and', 'Eastern', 'Airlines', ',', 'and', 'that', 'pacts', 'with', 'American', 'Airlines', ',', 'UAL', 'Inc', "'s", 'United', 'Airlines', 'and', 'USAir', 'also', 'would', 'be', 'ended.', '...', 'sometime', 'after', 'Dec.', '31', '.']
patched_tags = ['B-DATE', 'I-DATE', 'I-DATE', 'O', 'B-ORG', 'O', 'O', 'B-GPE', 'I-GPE', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O', 'B-ORG', 'O', 'B-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'B-ORG', 'I-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O']
tokens_col = df.columns.get_loc("Detected_ners")
tags_col = df.columns.get_loc("Detected_ner_types")
for i in unequal:
    # A fresh list per row, as the original list literals inside the loop produced.
    df.iat[i, tokens_col] = list(patched_tokens)
    df.iat[i, tags_col] = list(patched_tags)

In [343]:
# Collect the positions of rows whose gold and detected tag lists differ in length.
unequal = [
    row_pos
    for row_pos, (gold_tags, detected_tags) in enumerate(
        zip(df.Actual_ner_types, df.Detected_ner_types)
    )
    if len(gold_tags) != len(detected_tags)
]

In [344]:
len(unequal)

0

In [345]:
# Report accuracy_score and f1_score of the detected tags against the gold tags
# for the file currently held in `f`.
gold_tag_lists = df.Actual_ner_types.tolist()
detected_tag_lists = df.Detected_ner_types.tolist()
print(f)
print("accuracy score")
print(accuracy_score(gold_tag_lists, detected_tag_lists))
print(f1_score(gold_tag_lists, detected_tag_lists))

/Users/ramybal/Desktop/untitled folder/neuroner/detect_onto_nw_ner_perturb1_perturb1.csv
accuracy score
0.9737996663165049
0.8891021539773939
