In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read the tab-separated dump of the model's prediction errors
ers = pd.read_csv(
    '/Users/jason/svn/flair-custom/resume2/pred_errors.txt',
    sep='\t',
)

In [3]:
# Preview the first five error rows
ers.head(n=5)

Unnamed: 0.1,Unnamed: 0,text,gt,pred
0,10,Muslim,B-geo,B-org
1,187,90,B-tim,O
2,191,current,B-tim,O
3,192,fiscal,I-tim,B-tim
4,322,Saudi,B-org,O


In [4]:
# Remove the position prefix (B-, I-, S-, etc) from the tags, keeping only
# the base entity type (e.g. 'B-org' -> 'org'; a bare 'O' is left unchanged).
# The pattern is a raw string (the old '\-' was an invalid escape sequence)
# and is anchored with '^' so that only a leading "<capital letter>-" marker
# is removed, never a letter-hyphen pair further inside a tag.
# The vectorised .str.replace also passes missing values through as NaN
# instead of raising inside re.sub.
ers['gt_trim'] = ers['gt'].str.replace(r'^[A-Z]-', '', regex=True)
ers['pred_trim'] = ers['pred'].str.replace(r'^[A-Z]-', '', regex=True)

In [5]:
# List the distinct base tags present in the ground truth, in order of appearance
pd.unique(ers['gt_trim'])

array(['geo', 'tim', 'org', 'per', 'O', 'gpe', 'art', 'nat', 'eve'],
      dtype=object)

In [7]:
# Keep only the rows whose base tags disagree, so that a pure prefix
# difference (e.g. I-org vs B-org) is not counted as an error
ers_miss = ers.loc[ers['gt_trim'].ne(ers['pred_trim'])]

In [9]:
# (rows, columns) of the base-tag mismatch frame; the row count is the
# denominator used for the per-tag error percentages
ers_miss.shape

(2430, 6)

In [10]:
# Share of all base-tag mismatches (in percent) attributed to each
# ground-truth base tag, largest first
(
    ers_miss['gt_trim']
    .value_counts()
    .div(len(ers_miss))
    .mul(100)
)

org    32.263374
O      20.781893
geo    15.473251
tim    11.934156
per    11.851852
gpe     3.251029
art     2.427984
eve     1.481481
nat     0.534979
Name: gt_trim, dtype: float64

The above cell tells us that almost 1/3 of our errors are happening when the ground truth tag is 'org'. Let's look into this.

In [11]:
# Draw a reproducible 10-row sample of the mismatches whose
# ground-truth base tag is 'org'
tag = 'org'
ers_miss.loc[ers_miss['gt_trim'].eq(tag)].sample(n=10, random_state=777)

Unnamed: 0.1,Unnamed: 0,text,gt,pred,gt_trim,pred_trim
871,31805,Morocco,B-org,B-geo,org,geo
1648,59949,Gucht,I-org,I-per,org,per
1562,56743,Sweden,B-org,B-geo,org,geo
1797,66088,Ghraib,I-org,I-geo,org,geo
1054,39082,Berri,I-org,I-per,org,per
127,4285,International,I-org,O,org,O
755,27178,Abu,B-org,B-per,org,per
2518,90375,Babaker,B-org,B-per,org,per
2510,90096,Cerkez,B-org,B-per,org,per
2636,93411,Roman,B-org,B-per,org,per


In [13]:
# Read the tab-separated NER test set; the file has no header row, so the
# columns are numbered 0 and 1
test = pd.read_csv(
    '/Users/jason/svn/flair-custom/utf8/ner_test_utf8.txt',
    sep='\t',
    header=None,
)

When we look at the context for the first discrepancy below ("Morocco", labelled `B-org` in the ground truth but predicted as `B-geo`), it appears that our model made a better choice than the ground truth provided in the test data.

In [15]:
# Show the test-set rows surrounding the token with the discrepancy
test.iloc[31800:31814, :]

Unnamed: 0,0,1
31800,in,O
31801,1957,B-tim
31802,assumed,O
31803,the,O
31804,title,O
31805,of,O
31806,king,O
31807,Morocco,B-org
31808,annexed,O
31809,Western,B-geo
