In [9]:
import os
import pandas as pd
from nltk.metrics.agreement import AnnotationTask
import krippendorff
import numpy as np
from collections import Counter
import re

In [2]:
def get_unique_texts_from_df(path):
    df = pd.read_csv(path, sep = '\t')
    df = df[df['GOLDEN:result'].isna()]
    df.dropna(subset = ['INPUT:text'], inplace = True)
    return df, set(df['INPUT:text'])


In [6]:
def calculate_krippendorf_from_raw_df_nltk(df):
    coolected_data = []
    collected_texts = []
    named_collected_data_dict = {}
    
    text2ids = {}
    for t in set(df['INPUT:text'].tolist()):
        if t not in text2ids:
            text2ids[t] = len(text2ids)
    df['text_idx'] = df['INPUT:text'].map(text2ids)
    ids2text = {idx:text for text,idx in text2ids.items()}
    
    worker2id = {}
    for pair in df['ASSIGNMENT:worker_id'].tolist():
        if pair not in worker2id:
            worker2id[pair] = len(worker2id)
    df['worker_idx'] = df['ASSIGNMENT:worker_id'].map(worker2id)
    
    for i,el in df.iterrows():
        coder_idx = el['worker_idx']
        item_idx = el['text_idx']
        label = el['OUTPUT:result']
        
        collected_triplet = (coder_idx,item_idx,label )
        coolected_data.append(collected_triplet)
        
#         data_curr = [el['INPUT:text'],el['OUTPUT:result'],el['CONFIDENCE:result']]
        collected_texts.append(el['INPUT:text'])
    
    t = AnnotationTask(data=coolected_data)
    
    return t.alpha()


def process_toloka_results_batch(file):
        
    coresponding_raw_data_file_path = file[:-4]+'r.tsv'
    
    df_agg, agg_unique_texts = get_unique_texts_from_df(file)
    
    df_raw, raw_unique_texts = get_unique_texts_from_df(coresponding_raw_data_file_path)
    
    if len(raw_unique_texts) != len(agg_unique_texts & raw_unique_texts):
        print(len(raw_unique_texts), len(agg_unique_texts), len(agg_unique_texts & raw_unique_texts))
    
    nltk_alpha = calculate_krippendorf_from_raw_df_nltk(df_raw)
    

    filename = file.split('/')[-1]
    df_agg_report = pd.DataFrame(index = [filename],columns = [['alpha']] , data = [[nltk_alpha]])
    

    return df_agg_report

df_ag_r =  process_toloka_results_batch("./clean_collection/step1/s1g1.tsv",)

In [7]:
df_ag_r.head()

Unnamed: 0,alpha
s1g1.tsv,0.341238


In [10]:
def analysis():
    folder_with_folders = './clean_collection/'
    df_reports_list = []
    value_counts_list = []
    value_counts_strict_data_list = []
    for folder in os.listdir(folder_with_folders):
        final_folder = os.path.join(folder_with_folders, folder)
        if os.path.isdir(final_folder):
            for file in os.listdir(final_folder):
                if re.match('s\dg\d.tsv',file) or re.match('s\d.tsv',file):
                    filepath  = os.path.join(final_folder,file)
                    df_report = process_toloka_results_batch(filepath)
                    df_reports_list.append(df_report)

    df_reports_overall = pd.concat(df_reports_list)
    return df_reports_overall
df_all = analysis()

2135 2124 2124
1760 1759 1759
26434 26434 26432
18319 18319 18317
17455 17455 17454


In [11]:
df_all.head(100)

Unnamed: 0,alpha
step1\s1g1.tsv,0.341238
step1\s1g2.tsv,0.395047
step1\s1g3.tsv,0.369597
step2\s2g1.tsv,0.413621
step2\s2g2.tsv,0.383587
step2\s2g3.tsv,0.375858
step3\s3g1.tsv,0.39813
step3\s3g2.tsv,0.440545
step3\s3g3.tsv,0.413057
step4\s4g1.tsv,0.349669


In [14]:
df_all.columns

MultiIndex([('alpha',)],
           )

In [16]:
df_all[['alpha']].mean()

alpha    0.395936
dtype: float64