### Data retrieval

In [1]:
!curl https://raw.githubusercontent.com/andabi/deep-text-corrector/master/data/conll14st-test-data/noalt/official-2014.combined.m2 \
    --output official-2014.combined.m2

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  428k  100  428k    0     0   588k      0 --:--:-- --:--:-- --:--:--  587k


In [2]:
!head -n 20 official-2014.combined.m2

S Keeping the Secret of Genetic Testing

S What is genetic risk ?

S Genetic risk refers more to your chance of inheriting a disorder or disease .
A 3 4|||ArtOrDet||||||REQUIRED|||-NONE-|||0

S People get certain disease because of genetic changes .
A 3 4|||Nn|||diseases|||REQUIRED|||-NONE-|||0

S How much a genetic change tells us about your chance of developing a disorder is not always clear .

S If your genetic results indicate that you have gene changes associated with an increased risk of heart disease , it does not mean that you definitely will develop heart disease .

S The opposite is also true .

S If your genetic results show that you do not have changes associated with an increased risk of heart disease , it is still possible that you develop heart disease .
A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0
A 26 26|||Vt|||will|||REQUIRED|||-NONE-|||1



### Agreement calculation algorithm

In [10]:
from collections import namedtuple
from collections import defaultdict

# One annotator edit of a sentence. `agree` and `disagree` are sets of annotator ids.
Correction = namedtuple(
    'Correction',
    ['sentence_id', 'start_offset', 'end_offset', 'error_type',
     'edits', 'annotator_id', 'agree', 'disagree'],
)
# One sentence: its text and the number of tokens in it.
Sentence = namedtuple('Sentence', ['text', 'tokens_num'])

def load_m2(filename):
    """
        Parses an m2 file.

        Lines starting with 'S ' open a new sentence; lines starting with 'A ' describe one
        correction of the most recent sentence as '|||'-separated fields
        (offsets, error type, edits, ..., annotator id). All other lines are ignored.
        Offsets of -1 are expanded to the whole sentence: start 0, end = number of tokens.

        Returns a tuple (sentences, annotators, corrections, corrections_index):
            sentences         - list of Sentence, in file order
            annotators        - set of annotator ids seen in the file
            corrections       - list of Correction, in file order
            corrections_index - defaultdict: sentence id -> positions in `corrections`

        Raises ValueError if a correction line appears before any sentence line.
    """
    sentences = []
    corrections = []
    corrections_index = defaultdict(list)
    annotators = set()

    # Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
    with open(filename, encoding='utf-8') as m2:
        for line in m2:
            if line.startswith('S '):
                sentence = line[2:].strip()
                tokens_num = len(sentence.split(' '))
                sentences.append(Sentence(text=sentence, tokens_num=tokens_num))
            elif line.startswith('A '):
                if not sentences:
                    # Without this guard the correction would be filed under sentence id -1.
                    raise ValueError('Correction line found before any sentence line: %r' % line)
                sentence_id = len(sentences) - 1
                tokens_num = sentences[sentence_id].tokens_num

                fields = line[2:].strip().split('|||')
                offsets = [int(offset) for offset in fields[0].split()]
                # The edits field must not contain '||'
                # (presumably a separator for alternative edits - TODO confirm).
                assert len(fields[2].split('||')) == 1
                annotator_id = fields[5].strip()

                correction = Correction(
                    sentence_id=sentence_id,
                    start_offset=offsets[0] if offsets[0] != -1 else 0,
                    end_offset=offsets[1] if offsets[1] != -1 else tokens_num,
                    error_type=fields[1].strip(),
                    edits=fields[2].strip(),
                    annotator_id=annotator_id,
                    # Every annotator agrees with their own correction.
                    agree={annotator_id},
                    disagree=set(),
                )
                annotators.add(annotator_id)
                corrections.append(correction)
                corrections_index[sentence_id].append(len(corrections) - 1)
    return sentences, annotators, corrections, corrections_index

def normalize_agreement(sentence_id, corrections_index, corrections,
                        annotators=None, sentences=None):
    """
        Normalizes the corrections of one sentence.
        For every annotator without a correction in the sentence, a 'noop' correction
        spanning the whole sentence is appended. Thus we implicitly assume that such an
        annotator agrees that no errors are present.

        annotators - set of all annotator ids
        sentences  - list of Sentence, indexed by sentence id
        When either is omitted, the module-level name `annotators` / `sentences` is used,
        which is what the three-argument form of this function has always relied on.

        Mutates `corrections` and `corrections_index` in place; returns None.
    """
    # Legacy fallback: keep three-argument callers working.
    if annotators is None:
        annotators = globals()['annotators']
    if sentences is None:
        sentences = globals()['sentences']

    sentence_annotators = {corrections[i].annotator_id for i in corrections_index[sentence_id]}
    for annotator_id in annotators.difference(sentence_annotators):
        noop = Correction(sentence_id=sentence_id,
                          start_offset=0,
                          end_offset=sentences[sentence_id].tokens_num,
                          error_type='noop',
                          edits='-NONE-',
                          annotator_id=annotator_id,
                          agree={annotator_id},
                          disagree=set())
        corrections.append(noop)
        corrections_index[sentence_id].append(len(corrections) - 1)


def range_intersects(rx, ry):
    """
        Checks whether two ranges intersect.
        Both bounds are compared inclusively, so ranges that merely touch count as
        intersecting, and so do two identical zero-width ranges (start == end).
    """
    return rx.end_offset >= ry.start_offset and rx.start_offset <= ry.end_offset


def range_equals(rx, ry):
    """Checks that two ranges have the same start and end offsets"""
    return rx.start_offset == ry.start_offset and rx.end_offset == ry.end_offset


def process_sentence_agreement(annotators, index, corrections):
    """
        Processes annotators agreement of a sentence.

        index - positions in `corrections` of the corrections of one sentence.
        First every annotator not yet in a correction's `agree` set is put into its
        `disagree` set. Then an annotator is moved to `agree` when they made a
        correction with the same range, error type and edits.
        Mutates the `agree` / `disagree` sets in place; returns None.
    """
    for i in index:
        corrections[i].disagree.update(annotators.difference(corrections[i].agree))
    for i in index:
        x = corrections[i]
        for j in index:
            y = corrections[j]
            if x.annotator_id != y.annotator_id and range_intersects(x, y):
                if range_equals(x, y) and x.error_type == y.error_type and x.edits == y.edits:
                    x.disagree.discard(y.annotator_id)
                    x.agree.add(y.annotator_id)


def process_agreement(sentences, annotators, corrections, corrections_index):
    """
        Processes annotators agreement for all text.
        Every sentence is normalized first (missing annotators get a 'noop'), then the
        agree / disagree sets of each sentence's corrections are computed.
        Mutates `corrections` and `corrections_index` in place; returns None.
    """
    for sentence_id in range(len(sentences)):
        # Pass the collections explicitly so the result does not depend on module globals.
        normalize_agreement(sentence_id, corrections_index, corrections,
                            annotators=annotators, sentences=sentences)

    for index in corrections_index.values():
        process_sentence_agreement(annotators, index, corrections)

### Processing the data

In [5]:
# Parse the downloaded file; `data` keeps the four results together as one tuple.
data = load_m2('official-2014.combined.m2')
sentences, annotators, corrections, corrections_index = data

# Updates `corrections` and `corrections_index` in place.
process_agreement(sentences, annotators, corrections, corrections_index)

### Statistics calculation

We use no sophisticated metrics, just a simple percentage: __agreement = annotators_agreed / annotators_total__

In [6]:
import pandas as pd

# One row per correction, ordered so that the corrections of a sentence sit together.
corr_df = (
    pd.DataFrame(corrections)
    .sort_values(by=['sentence_id', 'annotator_id', 'start_offset'])
)

corr_df.head(50)

Unnamed: 0,sentence_id,start_offset,end_offset,error_type,edits,annotator_id,agree,disagree
5874,0,0,6,noop,-NONE-,0,"{1, 0}",{}
5873,0,0,6,noop,-NONE-,1,"{1, 0}",{}
5876,1,0,5,noop,-NONE-,0,"{1, 0}",{}
5875,1,0,5,noop,-NONE-,1,"{1, 0}",{}
0,2,3,4,ArtOrDet,,0,{0},{1}
5877,2,0,14,noop,-NONE-,1,{1},{0}
1,3,3,4,Nn,diseases,0,{0},{1}
5878,3,0,9,noop,-NONE-,1,{1},{0}
5880,4,0,19,noop,-NONE-,0,"{1, 0}",{}
5879,4,0,19,noop,-NONE-,1,"{1, 0}",{}


#### General agreement

In [7]:
# Total number of 'agree' and 'disagree' votes over all corrections.
# Each column is mapped with Series.map because DataFrame.applymap is deprecated
# since pandas 2.1.
general_stats = corr_df[['agree', 'disagree']].apply(lambda votes: votes.map(len).sum())

general_stats

agree       7773
disagree    4693
dtype: int64

In [8]:
# Overall agreement: share of 'agree' votes among all votes.
general_stats['agree'] / (general_stats['agree'] + general_stats['disagree'])

0.6235360179688754

#### Agreement by error type

In [9]:
# Per error type, sum the sizes of the agree / disagree sets.
error_stats = (
    corr_df[['error_type', 'agree', 'disagree']]
    .groupby(by=['error_type'])
    .agg(lambda votes: sum(map(len, votes)))
)

# Columns are addressed by label: integer lookups such as row[0] on a label-indexed
# Series are deprecated since pandas 2.1.
error_stats['agreement'] = error_stats['agree'] / (error_stats['agree'] + error_stats['disagree'])

error_stats

Unnamed: 0_level_0,agree,disagree,agreement
error_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ArtOrDet,1052,460,0.695767
Mec,850,566,0.600282
Nn,601,245,0.710402
Npos,44,24,0.647059
Others,99,87,0.532258
Pform,75,55,0.576923
Pref,267,223,0.544898
Prep,769,409,0.652801
Rloc-,309,269,0.534602
SVA,346,154,0.692
