In [28]:
import os

import pandas as pd
from lxml import etree

In [29]:
data_path = '/Users/victor/Downloads/conll14st-test-data/noalt/'
alt_files = [x for x in os.listdir(data_path) if x.endswith('sgml')]

In [30]:
class DataSource(object):
    
    def __init__(self, file_path): 
        self.started = False
        self.stoped = False
        self.file_path = file_path
    
    def read(self, requested_size=0):
        if self.stoped:
            self.file.close()
            return b''
        
        if not self.started: 
            self.started = True
            self.file = open(self.file_path, 'r')
            return b'<ROOT>'
        else: 
            try: 
                return next(self.file).encode('utf-8')
            except StopIteration:
                self.stoped = True
                return b'</ROOT>'

In [31]:
def xml_to_dataframe(list_of_files): 
    all_corrections = []

    for file_path in list_of_files: 
        ds = DataSource(os.path.join(data_path, file_path))

        current_doc_id = None
        current_teacher = None
        current_mistake = dict()
        for event, element in etree.iterparse(ds, events=('start', 'end')): 
            if event == 'start':     
                if element.tag == 'DOC':
                    current_doc_id = element.attrib['nid']
                elif element.tag == 'ANNOTATION':
                    current_teacher = element.attrib['teacher_id']
                elif element.tag == 'MISTAKE': 
                    current_mistake['start_par'] = element.attrib['start_par']
                    current_mistake['end_par'] = element.attrib['end_par']
                    current_mistake['start_off'] = element.attrib['start_off']
                    current_mistake['end_off'] = element.attrib['end_off']
                elif element.tag == 'TYPE':
                    current_mistake['type'] = element.text
                elif element.tag == 'CORRECTION': 
                    current_mistake['correction'] = element.text
            elif event == 'end': 
                if element.tag == 'MISTAKE':
                    all_corrections.append(
                        (current_doc_id, current_teacher, 
                         current_mistake['start_par'], current_mistake['end_par'],
                         current_mistake['start_off'], current_mistake['end_off'], 
                         current_mistake['type'], current_mistake['correction'])
                    )
                    current_mistake = dict()
                    
    return pd.DataFrame(all_corrections, columns=['doc_id', 'teacher_id', 'start_par', 'end_par', 
                                                  'start_off', 'end_off', 'type', 'correction'])

In [32]:
df = xml_to_dataframe(alt_files)

In [49]:
def get_full_agreement(s):
    """ Calculates inter-annotator agreement for full match of mistakes
    (all attributes like start/end offsets, type, correction etc. should match 100%).

    Calculated as:

        result = len(correction_A intersection correction_B) / len(correction_A union correction_B)

    Parameters
    ----------
    s : list of set
        list of corrections for each teacher

    Returns
    -------
    agreement value (from 0 to 1)
    """
    if len(s) <= 1:
        return (0, 0, 0)
    else:
        all_c = set()
        for x in s:
            all_c.update(x)

        intersection = s[0]
        for x in s[1:]:
            intersection = intersection & x
        common_len = len(intersection)
        total_len = len(all_c)
        return total_len, common_len, common_len / float(total_len)


In [50]:
df['hash'] = df.apply(
        lambda x: '{}:{}:{}:{}:{}:{}'.format(x['doc_id'], x['start_par'],
                                             x['end_par'], x['start_off'],
                                             x['end_off'], x['correction']), axis=1)
new_df = df.groupby(['type', 'teacher_id'])['hash'].agg(lambda x: set(x.tolist()))
new_df = new_df.groupby(['type']).agg(lambda x: get_full_agreement(x.tolist()))
result = pd.DataFrame({'error_type': new_df.index.tolist(), 'agreement': new_df.tolist()})
result['total_len'] = result['agreement'].apply(lambda x: print(type(x)))

<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>


In [47]:
result


Unnamed: 0,agreement,error_type
0,"(638, 138, 0.21630094043887146)",ArtOrDet
1,"(648, 79, 0.12191358024691358)",Mec
2,"(339, 104, 0.30678466076696165)",Nn
3,"(29, 5, 0.1724137931034483)",Npos
4,"(90, 3, 0.03333333333333333)",Others
5,"(60, 5, 0.08333333333333333)",Pform
6,"(237, 12, 0.05063291139240506)",Pref
7,"(508, 93, 0.1830708661417323)",Prep
8,"(284, 10, 0.035211267605633804)",Rloc-
9,"(202, 57, 0.28217821782178215)",SVA
