In [5]:
import os
import pandas as pd
import numpy as np

In [6]:
from typing import Any, Callable, Iterable, Optional, Sequence, Union

In [7]:
from nltk.metrics import masi_distance

In [8]:
from krippendorff.krippendorff import _coincidences

In [9]:
# https://stats.stackexchange.com/questions/407453/krippendorffs-alpha-in-r-for-multi-label-annotation
import nltk
from nltk.metrics import agreement
from nltk.metrics.agreement import AnnotationTask


task_data = [('coder1','Item0',frozenset(['l1','l2'])),
('coder2','Item0',frozenset(['l1'])),
('coder1','Item1',frozenset(['l1','l2'])),
('coder2','Item1',frozenset(['l1','l2'])),
('coder1','Item2',frozenset(['l1'])),
('coder2','Item2',frozenset(['l1']))]

task = AnnotationTask(distance = masi_distance)

task.load_array(task_data)

task.alpha()

0.4444444444444444

In [None]:
def calc_agreement_within_group_nofilter(df_group):
    worker2id = {}
    for pair in df_group['ASSIGNMENT:worker_id'].tolist():
        if pair not in worker2id:
            worker2id[pair] = len(worker2id)
    df_group['worker_idx'] = df_group['ASSIGNMENT:worker_id'].map(worker2id)
    
    text2ids = {}
    for t in set(df_group['INPUT:text'].tolist()):
        if t not in text2ids:
            text2ids[t] = len(text2ids)
    df_group['text_idx'] = df_group['INPUT:text'].map(text2ids)
    ids2text = {idx:text for text,idx in text2ids.items()}
    
    reply_columns = [c for c in df_group.columns if 'OUTPUT' in c]
    
    collected_data = []
    for i,el in df_group.iterrows():
        coder_idx = el['worker_idx']
        
        item_idx = el['text_idx']
        labels_list = []
        for repl in reply_columns:
#             print(repl[7:], el[repl])
            if el[repl] == True:
                labels_list.append(repl[7:])
        
        if len(labels_list) == 0: labels_list = ['none']
#         print(labels_list)
        collected_triplet = (coder_idx,item_idx,frozenset(labels_list))
        collected_data.append(collected_triplet)
        
#         collected_texts.append(el['INPUT:text'])
        
#         break
    task = AnnotationTask(distance = masi_distance)
    
#     print("collected_data", collected_data[:10])

    task.load_array(collected_data)

    return task.alpha()


def get_agreement_batch(batch_name, drop_traintest_examples = False, print_output = False,):
    if batch_name == 'first': 
        raw_files = ['g1r.tsv','g2r.tsv','g3r.tsv']
    elif batch_name == 'fourth':
        raw_files = ['g1_4r.tsv','g2_4r.tsv','g3_4r.tsv']
    elif batch_name == 'old':
        raw_files = ['maybe_old/g1r.tsv','maybe_old/g2r.tsv']
    
    index2textset_notrest = {}
    index2textset_withtrtest = {}
    for file in raw_files:
       
        batch_path = "./directly_from_toloka/"+file
        df_curr= pd.read_csv(batch_path, sep = '\t')
        
        index2textset_withtrtest[file] = set(df_curr['INPUT:text'])
        
        if drop_traintest_examples == True:
            golden_column = [c for c in df_curr.columns if 'GOLDEN' in c]
            df_curr = df_curr[df_curr[golden_column[1]].isna()]
        
        index2textset_notrest[file] = set(df_curr['INPUT:text'])
    
    intersect_text_no_tr_test = index2textset_notrest[file]
    engaged_text_withtrtest = index2textset_withtrtest[file]
    for f in index2textset_notrest:
        intersect_text_no_tr_test = intersect_text_no_tr_test.intersection(index2textset_notrest[f])
        engaged_text_withtrtest = engaged_text_withtrtest.union(index2textset_withtrtest[f])
        
    if print_output == True: print("intersecting texts are {} \n".format(len(intersect_text)))
    
    collected_agreements = []
    for i, file in enumerate(raw_files):
        batch_path = "./directly_from_toloka/"+file
        df_curr_common= pd.read_csv(batch_path, sep = '\t')
        
        if drop_traintest_examples == True:
            golden_column = [c for c in df_curr_common.columns if 'GOLDEN' in c]
            df_curr_common = df_curr_common[df_curr_common[golden_column[1]].isna()]
        
        columns_to_merge = [c for c in df_curr_common.columns if c == 'INPUT:text' or 'OUTPUT' in c  or c =='ASSIGNMENT:worker_id']
        df_curr_common = df_curr_common[columns_to_merge]
        
        current_aggr = calc_agreement_within_group_nofilter(df_curr_common)
#         print("current_aggr", current_aggr)
        collected_agreements.append(current_aggr)
        
#         break
    if print_output == True: 
        print("\n")
        print("collected_agreements", collected_agreements)
        print("average agreements", np.mean(collected_agreements))
    return intersect_text_no_tr_test, engaged_text_withtrtest, collected_agreements#intersect_text_no_tr_test - с тренй-тест если выбрать опцию не дропать тртест

    
int_text, eng_text, afrr = get_agreement_batch('first')

In [None]:
afrr

In [None]:
def get_whole_agreement_and_intersections(drop_traintest_examples, iterate_over_batches_list = ['first','fourth','old']):
    intersected_texts_from_batches = set()
    all_unique_texts_from_batches =  set()
    toloka_agreements_list = []
    for batch in iterate_over_batches_list:#
        intersect_texts, all_unique_texts_set, aggr_list = get_agreement_batch(batch, drop_traintest_examples = drop_traintest_examples)
        intersected_texts_from_batches = intersected_texts_from_batches.union(intersect_texts)
        all_unique_texts_from_batches = all_unique_texts_from_batches.union(all_unique_texts_set)
        print("{} batch agreement list - {}".format(batch, aggr_list))
        toloka_agreements_list.extend(aggr_list)
    return intersected_texts_from_batches,all_unique_texts_from_batches, np.mean(toloka_agreements_list)
common_texts_whole_toloka_notrtes, all_engaged_texts, whole_toloka_agreement = get_whole_agreement_and_intersections(True)

In [None]:
# [0.10661798648012055, 0.6364950196276913, 0.3874856997131322] - без трейн-тест семплов

In [None]:
len(common_texts_whole_toloka_notrtes), len(all_engaged_texts), whole_toloka_agreement

In [None]:
common_texts_whole_toloka_withtrtest,  all_engaged_texts, whole_toloka_agreement = get_whole_agreement_and_intersections(drop_traintest_examples = False)

In [None]:
len(common_texts_whole_toloka_withtrtest), len(all_engaged_texts), whole_toloka_agreement

In [None]:
len(common_texts_whole_toloka_withtrtest), len(all_engaged_texts), whole_toloka_agreement

# Проверяем пересечение текстов фактических батчей с сырой толоки с текстами заявленными в спарсенном датасете

In [None]:
import re
def get_len(text):
    if len(text.split()) > 5 and len (text)< 250:
        return True
    return None
def is_mostly_russian(text):
    text = str(text)
    russian_letters_count = len(re.findall("[а-яА-Я]",text))
    eng_letters_count = len(re.findall("[a-zA-Z]",text))
    if russian_letters_count > eng_letters_count:
        return True
    return None
def depersonalize(text):
    text = str(text)
    url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    text= re.sub(url_regex, "url", text)
    text = re.sub("id[\d]*",'',text)
    text = re.sub("> ?",'',text)
    text = re.sub("@[\w]*",'',text)
#     text = re.sub("\+[\d]*", "", text)
    text = re.sub("[\d]+", "NUMBER", text)
    text = re.sub(">>", "", text)
#     text = re.sub("[\d]{3,100}", "", text)
    text = re.sub("[\t|\n|\r]", "", text)
    text = re.sub("[(OP)|url|nickname|phone_number]", "", text)
    return text

In [None]:
df_sensitive = pd.read_csv("sensitive_topics_vs_source.csv")

In [None]:
df_sensitive_toloka = df_sensitive[df_sensitive['source'] == 'toloka']

In [None]:
len(df_sensitive_toloka)

In [None]:
df_sensitive_toloka_texts_set = set(df_sensitive_toloka['text'])
len(df_sensitive_toloka_texts_set)

### если дропать трейн тест

In [None]:
common_texts_whole_toloka_notrtes, all_engaged_texts, whole_toloka_agreement = get_whole_agreement_and_intersections(True)

In [None]:
common_texts_whole_toloka_prep = set([depersonalize(t) for t in common_texts_whole_toloka_notrtes])

In [None]:
len(df_sensitive_toloka_texts_set & common_texts_whole_toloka_prep)

In [None]:
len(df_sensitive_toloka_texts_set & common_texts_whole_toloka_prep)/len(df_sensitive_toloka_texts_set)

### если НЕ дропать трейн тест

In [None]:
common_texts_whole_toloka_withtrtest,  all_engaged_texts, whole_toloka_agreement = get_whole_agreement_and_intersections(drop_traintest_examples = False)

In [None]:
common_texts_whole_toloka_withtrtest_prep = set([depersonalize(t) for t in common_texts_whole_toloka_withtrtest])

In [None]:
len(df_sensitive_toloka_texts_set & common_texts_whole_toloka_withtrtest_prep)

In [None]:
len(df_sensitive_toloka_texts_set & common_texts_whole_toloka_withtrtest_prep)/len(df_sensitive_toloka_texts_set)

In [None]:
# 0.6331054485187487

In [None]:
len(df_sensitive_toloka_texts_set & all_engaged_texts_prep)

In [None]:
len(df_sensitive_toloka_texts_set & all_engaged_texts_prep)/len(df_sensitive_toloka_texts_set)

In [None]:
common_texts_whole_toloka_withtrtest,  all_engaged_texts, whole_toloka_agreement = get_whole_agreement_and_intersections(drop_traintest_examples = False,
                                                                                                                        iterate_over_batches_list = ['first','fourth'])

all_engaged_texts_prep = set([depersonalize(t) for t in all_engaged_texts])

In [None]:
len(df_sensitive_toloka_texts_set & all_engaged_texts_prep)/len(df_sensitive_toloka_texts_set)

# Пытаемся попрунить толокерский датасет для увеличения согласованности

In [10]:
import krippendorff
import re
from collections import Counter

In [37]:
df_sens = pd.read_csv("sensitive_topics_vs_source.csv")
toloka_trtes_set = set(df_sens[df_sens['source']=='toloka_trtest']['text'])

In [None]:
def krippendorf_with_filtering(collected_data,columns,ids2text, admit_multi, min_votes, max_reply_threshold):
    count_dict = {i:{c:0 for c in columns} for i in range(len(ids2text))}

    for _ ,item_idx,label_set in collected_data:
        for label in label_set:
            count_dict[item_idx][label] += 1
            
    value_counts_real = []
    value_counts_pruned = []
    value_counts_pruned_data = []
    
    value_counts_strict = []
    value_counts_strict_data = []
    
    for task_idx in range(len(ids2text)):
        current_count_element_real = [0] * len(columns) 
        current_count_element_pruned = [0] * len(columns)
        for idx, lbl in enumerate(columns):
            if count_dict[task_idx][lbl] > max_reply_threshold:
                print(count_dict[task_idx][lbl],task_idx,  lbl)
            current_count_element_pruned[idx] = min(max_reply_threshold, count_dict[task_idx][lbl])
            value_counts_pruned_data.append(ids2text[task_idx])
                
            current_count_element_real[idx] = count_dict[task_idx][lbl]
           
        value_counts_real.append(current_count_element_real)
        value_counts_pruned.append(current_count_element_pruned)
        
        is_strict = meets_strict_req(current_count_element_pruned,admit_multi, min_votes)
        if is_strict == True:
            value_counts_strict.append(current_count_element_pruned)
            value_counts_strict_data.append(ids2text[task_idx])
     
    aplha_no_strict_real  = krippendorff.alpha(value_counts=np.array(value_counts_real),level_of_measurement='nominal')
    aplha_no_strict_pruned = krippendorff.alpha(value_counts=np.array(value_counts_pruned),level_of_measurement='nominal')
    alpha_strict = krippendorff.alpha(value_counts=np.array(value_counts_strict),level_of_measurement='nominal')
    
    unique_texts_from_all_val_counts = set(value_counts_pruned_data)
    unique_texts_from_strict_val_counts = set(value_counts_strict_data)
    
    return (aplha_no_strict_real,aplha_no_strict_pruned, alpha_strict), unique_texts_from_all_val_counts, unique_texts_from_strict_val_counts
 

In [82]:
def calc_agreement_within_group_filter(df_group):
    worker2id = {}
    for pair in df_group['ASSIGNMENT:worker_id'].tolist():
        if pair not in worker2id:
            worker2id[pair] = len(worker2id)
    df_group['worker_idx'] = df_group['ASSIGNMENT:worker_id'].map(worker2id)
    
    text2ids = {}
    for t in set(df_group['INPUT:text'].tolist()):
        if t not in text2ids:
            text2ids[t] = len(text2ids)
    df_group['text_idx'] = df_group['INPUT:text'].map(text2ids)
    ids2text = {idx:text for text,idx in text2ids.items()}
    
    reply_columns = [c for c in df_group.columns if 'OUTPUT' in c]
    
    collected_data = []
    item_idx_dict = {}
    
    for i,el in df_group.iterrows():
        coder_idx = el['worker_idx']
        
        item_idx = el['text_idx']
        
        if item_idx in item_idx_dict:
            item_idx_dict[item_idx] += 1
        else:
            item_idx_dict[item_idx] = 1
            
        labels_list = []
        for repl in reply_columns:
            if el[repl] == True:
                labels_list.append(repl[7:])

        if len(labels_list) == 0: labels_list = ['none']
        collected_triplet = (coder_idx,item_idx,frozenset(labels_list))
        collected_data.append(collected_triplet)
        
    task = AnnotationTask(distance = masi_distance)
    

    task.load_array(collected_data)
    
    return task.alpha() #, collected_data, [r[7:] for r in reply_columns], ids2text


def meets_strict_req(element_list,admit_multi, min_votes):
    admitable_multichoice_count = admit_multi
    multichoice_list = [el for el in element_list if el > min_votes] 
    if len(multichoice_list) <= admitable_multichoice_count:
        return True
    else:
        return False


def get_high_conf_texts(df_hc, thrsh):
    confidnece_columns = []
    for c in df_hc.columns:
        if 'CONFIDENCE' in c:
            df_hc[c] = df_hc[c].apply(lambda x: float(x[:-1])/100)
            confidnece_columns.append(c)
            
    df_hc['meet_req'] = df_hc.apply(lambda x: all([x[c] > thrsh for c in confidnece_columns]), axis = 1)
    
    return set(df_hc[df_hc['meet_req'] == True]['INPUT:text'])
    
#admit_multi, min_votes, 
def get_aggr_batch_with_filtering(batch_name, drop_unconfident_thrsh = None, 
                                  drop_traintest_examples = False, print_output = False, drop_trtest = True):
    if batch_name == 'first': 
        raw_files = ['g1r.tsv','g2r.tsv','g3r.tsv']
    elif batch_name == 'fourth':
        raw_files = ['g1_4r.tsv','g2_4r.tsv','g3_4r.tsv']
    elif batch_name == 'old':
        raw_files = ['maybe_old/g1r.tsv','maybe_old/g2r.tsv']
    
    engaged_texts= set()
   
    collected_agreements = []

    for i, file in enumerate(raw_files):
        print("="*100)
        batch_path = "./directly_from_toloka/"+file
        df_curr_common= pd.read_csv(batch_path, sep = '\t')
        
        if drop_trtest == True:
            print("before dropping trtest ...", len(df_curr_common))
            df_curr_common = df_curr_common[~df_curr_common['INPUT:text'].isin(toloka_trtes_set)]#!!
            print("after dropping trtest ...", len(df_curr_common))
            
        if drop_unconfident_thrsh:
            file_agg = re.sub('r','',file)
            batch_path_agg = "./directly_from_toloka/"+file_agg
            df_curr_agg = pd.read_csv(batch_path_agg, sep = '\t')
        
            current_high_conf_texts = get_high_conf_texts(df_curr_agg, drop_unconfident_thrsh)
            
            print("before dropping low conf ...", len(df_curr_common))
            df_curr_common = df_curr_common[df_curr_common['INPUT:text'].isin(current_high_conf_texts)]
            print("after dropping low conf ...", len(df_curr_common))
                
        engaged_texts = engaged_texts.union(set(df_curr_common['INPUT:text']))
        
        columns_to_merge = [c for c in df_curr_common.columns if c == 'INPUT:text' or 'OUTPUT' in c  or c =='ASSIGNMENT:worker_id']
        df_curr_common = df_curr_common[columns_to_merge]
        
        alpha_curr = calc_agreement_within_group_filter(df_curr_common)
            
        collected_agreements.append(alpha_curr)
            
    return collected_agreements, engaged_texts

aggr, eng_txts = get_aggr_batch_with_filtering('first',drop_unconfident_thrsh = 0.8, print_output = True)

before dropping trtest ... 12705
after dropping trtest ... 10006
before dropping low conf ... 10006
after dropping low conf ... 7668
before dropping trtest ... 15009
after dropping trtest ... 12250
before dropping low conf ... 12250
after dropping low conf ... 8540
before dropping trtest ... 16934
after dropping trtest ... 13810
before dropping low conf ... 13810
after dropping low conf ... 8641


In [83]:
aggr, len(eng_txts)

([0.39641539683044824, 0.6211801748074177, 0.39857040655455145], 2289)

In [84]:
def get_filter_stats_from_all_batches(drop_unconfident_thrsh, batches = ['first','fourth','old'],drop_trtest=True):
    
    collected_agreements_no_pruned = []
    
    texts_set = set ()
    for batch in batches:#
        aggr_lst_curr, eng_txts_curr = get_aggr_batch_with_filtering(batch, 
                                                                     drop_unconfident_thrsh = drop_unconfident_thrsh,
                                                                    drop_trtest = drop_trtest)
        collected_agreements_no_pruned.extend(aggr_lst_curr)
        
        texts_set = texts_set.union(eng_txts_curr)
    
    return np.mean(aggr_lst_curr), texts_set
    
agrmnt, alltxtx = get_filter_stats_from_all_batches(0.8)

before dropping trtest ... 12705
after dropping trtest ... 10006
before dropping low conf ... 10006
after dropping low conf ... 7668
before dropping trtest ... 15009
after dropping trtest ... 12250
before dropping low conf ... 12250
after dropping low conf ... 8540
before dropping trtest ... 16934
after dropping trtest ... 13810
before dropping low conf ... 13810
after dropping low conf ... 8641
before dropping trtest ... 33186
after dropping trtest ... 27715
before dropping low conf ... 27715
after dropping low conf ... 19448
before dropping trtest ... 39363
after dropping trtest ... 33894
before dropping low conf ... 33894
after dropping low conf ... 25467
before dropping trtest ... 27241
after dropping trtest ... 23246
before dropping low conf ... 23246
after dropping low conf ... 16550
before dropping trtest ... 8666
after dropping trtest ... 8042
before dropping low conf ... 8042
after dropping low conf ... 3969
before dropping trtest ... 3898
after dropping trtest ... 3678
before

In [85]:
agrmnt, len(alltxtx)

(0.4744957067281021, 8694)

In [86]:
def check_different_filtering_options(batches = ['first','fourth','old'], drop_trtest = True):
    data = []
    for ds_thrsh_curr in [0.95,0.98]:
        agg, engaged_txt = get_filter_stats_from_all_batches(batches = batches, drop_unconfident_thrsh = ds_thrsh_curr, drop_trtest = drop_trtest)
        data.append([ds_thrsh_curr, agg, len(engaged_txt)])
    df_rep = pd.DataFrame(data = data, columns = ['threshold','agreeement_via_threshold','all_engaged_texts'])
    return df_rep

In [87]:
df_no_trtes_rep = check_different_filtering_options(drop_trtest=True)
df_no_trtes_rep.head()

before dropping trtest ... 12705
after dropping trtest ... 10006
before dropping low conf ... 10006
after dropping low conf ... 7649
before dropping trtest ... 15009
after dropping trtest ... 12250
before dropping low conf ... 12250
after dropping low conf ... 7797
before dropping trtest ... 16934
after dropping trtest ... 13810
before dropping low conf ... 13810
after dropping low conf ... 6870
before dropping trtest ... 33186
after dropping trtest ... 27715
before dropping low conf ... 27715
after dropping low conf ... 15187
before dropping trtest ... 39363
after dropping trtest ... 33894
before dropping low conf ... 33894
after dropping low conf ... 23329
before dropping trtest ... 27241
after dropping trtest ... 23246
before dropping low conf ... 23246
after dropping low conf ... 14049
before dropping trtest ... 8666
after dropping trtest ... 8042
before dropping low conf ... 8042
after dropping low conf ... 1972
before dropping trtest ... 3898
after dropping trtest ... 3678
before

Unnamed: 0,threshold,agreeement_via_threshold,all_engaged_texts
0,0.95,0.608286,7610
1,0.98,0.669712,7066


In [88]:
df = check_different_filtering_options(batches = ['first','fourth'], drop_trtest = True)
df.head()

before dropping trtest ... 12705
after dropping trtest ... 10006
before dropping low conf ... 10006
after dropping low conf ... 7649
before dropping trtest ... 15009
after dropping trtest ... 12250
before dropping low conf ... 12250
after dropping low conf ... 7797
before dropping trtest ... 16934
after dropping trtest ... 13810
before dropping low conf ... 13810
after dropping low conf ... 6870
before dropping trtest ... 33186
after dropping trtest ... 27715
before dropping low conf ... 27715
after dropping low conf ... 15187
before dropping trtest ... 39363
after dropping trtest ... 33894
before dropping low conf ... 33894
after dropping low conf ... 23329
before dropping trtest ... 27241
after dropping trtest ... 23246
before dropping low conf ... 23246
after dropping low conf ... 14049
before dropping trtest ... 12705
after dropping trtest ... 10006
before dropping low conf ... 10006
after dropping low conf ... 7642
before dropping trtest ... 15009
after dropping trtest ... 12250
b

Unnamed: 0,threshold,agreeement_via_threshold,all_engaged_texts
0,0.95,0.640861,6681
1,0.98,0.676742,6501
