In [281]:
import json
from collections import Counter
from math import isnan

import pandas as pd

In [414]:
# Load the raw Toloka export and keep only the columns the notebook works with:
# the input text, the annotator's output fields, the golden fields and the worker id.
kept_columns = [
    'INPUT:input',
    'OUTPUT:connections',
    'OUTPUT:no_relations',
    'OUTPUT:incorrect_markup',
    'GOLDEN:connections',
    'GOLDEN:no_relations',
    'GOLDEN:incorrect_markup',
    'ASSIGNMENT:worker_id',
]
df = pd.read_csv('TOLOKA_RESULTS.tsv', sep='\t')[kept_columns]

In [415]:
# Turn the stringified markup in both connection columns into Python objects.
# Float cells (the NaN placeholders pandas uses for empty cells) are left untouched.
# NOTE(review): eval() executes whatever text the TSV contains. This is only safe
# for a trusted export; consider ast.literal_eval or json.loads if the cells are
# plain literals -- TODO confirm the cell format before switching.
for connections_column in ('OUTPUT:connections', 'GOLDEN:connections'):
    df[connections_column] = df[connections_column].apply(
        lambda cell: cell if type(cell) == float else eval(cell)
    )

# Control tasks: rows that carry a golden markup, one row per distinct input text.
has_golden = df['GOLDEN:connections'].notna()
control_df = df[has_golden].drop_duplicates(subset=['INPUT:input'])

# Remaining rows: the golden cell is still a float (NaN), i.e. no golden markup.
golden_is_float = df['GOLDEN:connections'].apply(lambda cell: type(cell) == float)
res_df = df[golden_is_float]

# Filtering annotators by their performance

In [416]:
# Build one row per annotator from the rows that have a golden markup:
# the annotator's outputs and the matching golden markups are collected
# into two parallel lists.
golden_rows = df.loc[
    df['GOLDEN:connections'].notna(),
    ['OUTPUT:connections', 'GOLDEN:connections', 'ASSIGNMENT:worker_id'],
]

ann_df = (
    golden_rows
    .groupby('ASSIGNMENT:worker_id')
    .agg({'OUTPUT:connections': list, 'GOLDEN:connections': list})
    .reset_index()
)

# Number of golden-marked rows each annotator completed.
ann_df['GOLDEN:number'] = ann_df['GOLDEN:connections'].map(len)

In [417]:
# Preview the first three rows of the per-annotator frame.
ann_df.head(3)

Unnamed: 0,ASSIGNMENT:worker_id,OUTPUT:connections,GOLDEN:connections,GOLDEN:number
0,067901a253b8039272920ee9871312e4,"[[{'T1': {'value': 'per', 'text': 'Юра Гершков...","[[{'T1': {'value': 'per', 'text': 'Юра Гершков...",24
1,0f891e4bff332c6c531b140c56cf6aa0,"[[{'T1': {'value': 'per', 'text': 'Лев Михайло...","[[{'T1': {'value': 'per', 'text': 'Лев Михайло...",18
2,0fa81c631c7ddf6866d0e74b445def47,"[[{'T1': {'value': 'per', 'text': 'Юлия Петров...","[[{'T1': {'value': 'per', 'text': 'Юлия Петров...",10


In [418]:
def dict_to_tuple(input_dict: list) -> list:
    '''
    Reduce a list of relation dicts to a sorted list of
    (T1 text, T2 text, connection_type) triples.

    Every other field of the relation dicts is ignored, and the
    result is sorted so that two markups can be compared regardless
    of the order in which the relations were listed.
    '''

    triples = []
    for relation in input_dict:
        triples.append(
            (
                relation['T1']['text'],
                relation['T2']['text'],
                relation['connection_type'],
            )
        )
    triples.sort()
    return triples


def find_matching(outputs: list, goldens: list) -> float:
    '''
    Compute an annotator's accuracy on golden tasks.

    Each annotator output is paired with the golden markup at the same
    position. Both are reduced with `dict_to_tuple` and the pair counts
    as a match only when the reduced markups are equal.

    Parameters
    ----------
    outputs : list
        The annotator's markups, one per golden task.
    goldens : list
        The golden markups, aligned with `outputs`.

    Returns
    -------
    float
        Share of matching pairs rounded to 2 decimals; 0.0 when there
        are no pairs at all (instead of raising ZeroDivisionError).
    '''

    num_matches = 0
    num_all = 0
    for output, golden in zip(outputs, goldens):
        num_all += 1
        try:
            is_match = dict_to_tuple(output) == dict_to_tuple(golden)
        except (TypeError, LookupError, AttributeError):
            # Missing or malformed markup (e.g. a NaN output, or a relation
            # without the expected keys) is scored as a miss. Only data-shape
            # errors are caught so that real programming errors still surface.
            continue
        if is_match:
            num_matches += 1

    if num_all == 0:
        return 0.0
    return round(num_matches / num_all, 2)

In [419]:
def _control_accuracy(worker_row):
    # Score one annotator row: outputs versus the aligned golden markups.
    return find_matching(worker_row['OUTPUT:connections'], worker_row['GOLDEN:connections'])


ann_df['WORKER:quality'] = ann_df.apply(_control_accuracy, axis=1)

In [420]:
# Display the per-annotator frame, which now includes the WORKER:quality column.
ann_df

Unnamed: 0,ASSIGNMENT:worker_id,OUTPUT:connections,GOLDEN:connections,GOLDEN:number,WORKER:quality
0,067901a253b8039272920ee9871312e4,"[[{'T1': {'value': 'per', 'text': 'Юра Гершков...","[[{'T1': {'value': 'per', 'text': 'Юра Гершков...",24,0.79
1,0f891e4bff332c6c531b140c56cf6aa0,"[[{'T1': {'value': 'per', 'text': 'Лев Михайло...","[[{'T1': {'value': 'per', 'text': 'Лев Михайло...",18,0.89
2,0fa81c631c7ddf6866d0e74b445def47,"[[{'T1': {'value': 'per', 'text': 'Юлия Петров...","[[{'T1': {'value': 'per', 'text': 'Юлия Петров...",10,0.50
3,14faac6951518d2ac0e04d5c1ed1fca4,"[[{'T1': {'value': 'per', 'text': 'Дима', 'sta...","[[{'T1': {'value': 'per', 'text': 'Дима', 'sta...",1,0.00
4,15668ee15ff59306c744085d68d15c6f,[nan],"[[{'T1': {'value': 'per', 'text': 'В . В . Щер...",1,0.00
...,...,...,...,...,...
149,faf2179c9bbae256e55277c7eeefbd86,"[[{'T1': {'value': 'per', 'text': 'Ширак', 'st...","[[{'T1': {'value': 'per', 'text': 'Ширак', 'st...",2,0.50
150,fbff039d8f428b06be03d0dbb7244c90,"[[{'T1': {'value': 'per', 'text': 'Л . Киселёв...","[[{'T1': {'value': 'per', 'text': 'Л . Киселёв...",19,0.84
151,fc0539a3c5994048f03b0cf55496e19e,"[[{'T2': {'color': 'grey', 'text': 'писатель',...","[[{'T1': {'value': 'per', 'text': 'Хью Лофтинг...",1,0.00
152,fc4e5a8a9c869a7cb40aebb49660d838,"[[{'T1': {'value': 'per', 'text': 'Кристиной',...","[[{'T1': {'value': 'per', 'text': 'Кристиной',...",2,1.00


# Finding most-common responses

In [421]:
# Attach each worker's quality score to their rows in res_df.
# The lookup frame is indexed by worker id so that join() can match on it.
quality_by_worker = ann_df.set_index('ASSIGNMENT:worker_id')[['WORKER:quality']]
res_df = res_df.join(quality_by_worker, on='ASSIGNMENT:worker_id')

In [424]:
# Remove the flag columns and the golden columns from res_df;
# the following cells only use the input text, the output markup,
# the worker id and the worker quality.
unused_columns = [
    'OUTPUT:no_relations',
    'OUTPUT:incorrect_markup',
    'GOLDEN:incorrect_markup',
    'GOLDEN:no_relations',
    'GOLDEN:connections',
]
res_df = res_df.drop(unused_columns, axis='columns')

In [425]:
# Display the rows without golden markup together with each worker's quality score.
res_df

Unnamed: 0,INPUT:input,OUTPUT:connections,ASSIGNMENT:worker_id,WORKER:quality
2,"Павлов , жена — Надя .","[{'T1': {'value': 'per', 'text': 'Павлов', 'st...",3c338f6812fef4da6d871f2cca3b72ef,0.87
3,"О . Виктор — американец по рождению , как и ег...","[{'T1': {'value': 'per', 'text': 'О . Виктор',...",3c338f6812fef4da6d871f2cca3b72ef,0.87
4,« Тайный » ( по всем прошлым понятиям ) доклад...,,3c338f6812fef4da6d871f2cca3b72ef,0.87
5,Сектор установил связи с учеными академических...,,3c338f6812fef4da6d871f2cca3b72ef,0.87
6,"Он бы улыбнулся , если бы смог увидеть , как с...","[{'T1': {'value': 'per', 'text': 'Горбачев', '...",3c338f6812fef4da6d871f2cca3b72ef,0.87
...,...,...,...,...
3984,"Профессиональные , старые театры закрываются ,...","[{'T1': {'value': 'fac', 'color': 'yellow', 's...",b5d97fb811f108f070ffda59491c0a1f,0.90
3985,"Оказывается , секретарь ЦК ДОМА , который приб...",[],b5d97fb811f108f070ffda59491c0a1f,0.90
3986,На праздничном ужине сидел напротив М . Перель...,"[{'T1': {'value': 'per', 'end': '49', 'color':...",b5d97fb811f108f070ffda59491c0a1f,0.90
3987,> Он меняться не хотел . Это я о Володе сочини...,,b5d97fb811f108f070ffda59491c0a1f,0.90


In [447]:
# An annotator qualifies with a quality score above 0.7 earned on more
# than one golden task.
# NOTE(review): this list is only printed in the cells visible here; the
# aggregation below weights every annotator by quality instead of filtering
# by it -- confirm that is intended.
good_quality = ann_df['WORKER:quality'].gt(0.7)
several_goldens = ann_df['GOLDEN:number'].gt(1)
qualified_annotators = ann_df.loc[
    good_quality & several_goldens, 'ASSIGNMENT:worker_id'
].tolist()

In [448]:
# Report how many annotators passed the qualification filter.
num_qualified = len(qualified_annotators)
print('Number of qualified annotators:', num_qualified)

Number of qualified annotators: 51


In [537]:
# One row per input text: all annotations given for it and, in the same
# order, the quality scores of the workers who gave them.
grouped_df = (
    res_df
    .groupby('INPUT:input')
    .agg({'OUTPUT:connections': list, 'WORKER:quality': list})
    .reset_index()
)

In [708]:
def aggregate(annotations: list, weights: list) -> 'tuple | str':
    '''
    Pick the quality-weighted majority annotation for one input text.

    Every annotation votes with the weight of the annotator who produced
    it. Missing (NaN) and empty annotations all vote for the shared
    answer 'no_relations'.

    Parameters
    ----------
    annotations : list
        One entry per annotator: a list of relation dicts with the keys
        'T1', 'T2' and 'connection_type', an empty list, or NaN.
    weights : list
        Annotator quality scores aligned with `annotations`. A NaN or
        None weight (a worker with no known quality) counts as 0.

    Returns
    -------
    tuple or str
        The winning annotation as a tuple of
        ((value, text, start, end, color), (value, text, start, end, color),
        connection_type) triples, or the string 'no_relations'. Also
        'no_relations' when `annotations` is empty. Ties go to the
        answer that was seen first.
    '''

    # Entity fields are always read in this order, so the same markup
    # serialised with a different dict key order still yields the same
    # vote key and the positions match what `tuple_to_dict` expects.
    entity_fields = ('value', 'text', 'start', 'end', 'color')
    votes = Counter()

    for ann, weight in zip(annotations, weights):
        # `weight != weight` is true only for NaN. Adding NaN to a tally
        # would make the ranking undefined, so unknown quality votes with 0.
        if weight is None or weight != weight:
            weight = 0.0

        if isinstance(ann, float) or not ann:
            answer = 'no_relations'
        else:
            relations = [
                (
                    tuple(rel['T1'].get(field) for field in entity_fields),
                    tuple(rel['T2'].get(field) for field in entity_fields),
                    rel['connection_type'],
                )
                for rel in ann
            ]
            # Sort so that the order in which an annotator listed the
            # relations does not split the vote; repr gives a total order
            # even if field values have mixed types.
            answer = tuple(sorted(relations, key=repr))

        votes[answer] += weight

    if not votes:
        return 'no_relations'
    return votes.most_common(1)[0][0]

In [709]:
def _weighted_answer(input_row):
    # Resolve one input text: its annotations weighted by worker quality.
    return aggregate(input_row['OUTPUT:connections'], input_row['WORKER:quality'])


grouped_df['OUTPUT:answer'] = grouped_df.apply(_weighted_answer, axis=1)

In [736]:
def tuple_to_dict(all_anns: tuple) -> list:
    '''
    Rebuild relation dicts from an aggregated tuple annotation.

    Each item of `all_anns` is read positionally as
    (T1 fields, T2 fields, connection_type), and the fields of each
    entity are read positionally as value, text, start, end, color.
    Returns a list with one {'T1', 'T2', 'connection_type'} dict per item.
    '''

    def entity_from(fields):
        # Positions 0-4 map onto the named entity fields.
        return {
            'value': fields[0],
            'text': fields[1],
            'start': fields[2],
            'end': fields[3],
            'color': fields[4],
        }

    return [
        {
            'T1': entity_from(ann[0]),
            'T2': entity_from(ann[1]),
            'connection_type': ann[2],
        }
        for ann in all_anns
    ]

In [750]:
# Convert the winning tuple annotations back into lists of relation dicts.
# 'no_relations' becomes an empty list (not an empty dict) so the column
# holds one type only and uses the same empty marker as the raw
# OUTPUT:connections values.
grouped_df['OUTPUT:answer'] = grouped_df['OUTPUT:answer'].apply(
    lambda answer: tuple_to_dict(answer) if answer != 'no_relations' else []
)

In [765]:
# Bring the aggregated crowd answers and the golden markups to the same
# two-column layout: input text plus answer.
answer_columns = ['INPUT:input', 'OUTPUT:answer']
toloka = grouped_df[answer_columns]
golden = control_df.rename(columns={'GOLDEN:connections': 'OUTPUT:answer'})[answer_columns]

In [767]:
# Save the aggregated crowd answers together with the golden markups.
# The result goes to its own file: 'TOLOKA_RESULTS.tsv' is the raw export
# that the loading cell reads, and overwriting it would destroy the source
# data and make the notebook fail on the next run (the raw columns would
# no longer exist).
(
    pd
    .concat([toloka, golden])
    .to_csv('TOLOKA_RESULTS_AGGREGATED.tsv', sep='\t', index=False)
)