In [1]:
import os
import pandas as pd
from nltk.metrics.agreement import AnnotationTask
import krippendorff
import numpy as np
from collections import Counter

In [2]:
df_agg_test = pd.read_csv("./clean_collection/step1/s1g1.tsv", sep = '\t')
df_raw_test = pd.read_csv("./clean_collection/step1/s1g1r.tsv", sep = '\t')

In [3]:
df_agg_test.head()

Unnamed: 0,INPUT:text,OUTPUT:result,CONFIDENCE:result,GOLDEN:result,Unnamed: 4
0,вполне вероятно- поврежден матрикс..нужен врач...,no,99.99%,,
1,Работа проституткой это стыдно? интересует сов...,no,95.89%,,
2,"пожарные, выдвижные-сборные для крыш в основном",no,99.99%,,
3,Помогите решить задачу по нахождению уровня бе...,no,99.99%,,
4,Больше автоматизации уволеных рабочих людей не...,yes,95.27%,,


In [4]:
df_raw_test.dropna(subset = ['OUTPUT:result'], inplace = True)
df_raw_test[['ASSIGNMENT:worker_id','INPUT:text','OUTPUT:result']].head()

Unnamed: 0,ASSIGNMENT:worker_id,INPUT:text,OUTPUT:result
0,b0df6d039a6ba8737dbe14c3abf69935,"тебе доктора говорят, ты - не веришь, а дворни...",yes
1,b0df6d039a6ba8737dbe14c3abf69935,какие бывают комбинации в покере,no
2,b0df6d039a6ba8737dbe14c3abf69935,А как работает карбюратор на движках с турбона...,no
3,b0df6d039a6ba8737dbe14c3abf69935,"Слушай, тут дети, может не будешь такие пошлос...",no
4,b0df6d039a6ba8737dbe14c3abf69935,Сколько зарабатывает начинающий фотограф в час...,yes


In [5]:
def get_unique_texts_from_df(path):
    df = pd.read_csv(path, sep = '\t')
    df = df[df['GOLDEN:result'].isna()]
    df.dropna(subset = ['INPUT:text'], inplace = True)
    return df, set(df['INPUT:text'])

def calculate_krippendorf_from_raw_df_nltk(df):
    coolected_data = []
    collected_texts = []
    named_collected_data_dict = {}
    
    text2ids = {}
    for t in set(df['INPUT:text'].tolist()):
        if t not in text2ids:
            text2ids[t] = len(text2ids)
    df['text_idx'] = df['INPUT:text'].map(text2ids)
    ids2text = {idx:text for text,idx in text2ids.items()}
    
    worker2id = {}
    for pair in df['ASSIGNMENT:worker_id'].tolist():
        if pair not in worker2id:
            worker2id[pair] = len(worker2id)
    df['worker_idx'] = df['ASSIGNMENT:worker_id'].map(worker2id)
    
    for i,el in df.iterrows():
        coder_idx = el['worker_idx']
        item_idx = el['text_idx']
        label = el['OUTPUT:result']
        
        collected_triplet = (coder_idx,item_idx,label )
        coolected_data.append(collected_triplet)
        
#         data_curr = [el['INPUT:text'],el['OUTPUT:result'],el['CONFIDENCE:result']]
        collected_texts.append(el['INPUT:text'])
    
    t = AnnotationTask(data=coolected_data)
    
    return t.alpha(),coolected_data, ids2text

def krippendorf_with_filtering(collected_data,ids2text, constraints_type, df_agg_high_quality):
    count_dict = {i:{'no':0,"yes":0} for i in range(len(ids2text))}
    
    text2data = {el['INPUT:text']:[el['OUTPUT:result'],el['CONFIDENCE:result']] for _,el in  df_agg_high_quality.iterrows()}
    
    for _ ,item_idx,label in collected_data:
        count_dict[item_idx][label] += 1
        
    value_counts = []
    value_counts_strict = []
    value_counts_strict_data = []
    
    strict_tasks_ids2vote_dict = {}
    for task_idx in range(len(ids2text)):
        current_count_element = [0,0]
        for idx, lbl in enumerate(["no","yes"]):
            current_count_element[idx] = count_dict[task_idx][lbl]
     
        value_counts.append(current_count_element)
        
        current_count_element_string = ''.join(sorted([str(el) for el in current_count_element if el != 0]))
        
        if constraints_type == "top_confidence_only":
            STRICT_CONSTRAINTS = ('2','3','4','5','6','7','8','9','10')
        elif constraints_type in ("top_confidence_vs_oneAND","top_confidence_vs_twoAND"):
             STRICT_CONSTRAINTS = ['2','3','4','5','6','7','8','9','10']
             additional_elements = []
             for strinct_el in STRICT_CONSTRAINTS:
                if strinct_el!= '10':
                    if constraints_type == "top_confidence_vs_oneAND":
                        new_el = '1'+strinct_el
                        additional_elements.append(new_el)
                    elif constraints_type == "top_confidence_vs_twoAND":
                        new_el = '1'+strinct_el
                        additional_elements.append(new_el)
                        
                        new_el = '2'+strinct_el
                        additional_elements.append(new_el)
                        
             STRICT_CONSTRAINTS.extend(additional_elements)
                
        if current_count_element_string in STRICT_CONSTRAINTS:
            value_counts_strict.append(current_count_element)
                 
            curr_data = [ids2text[task_idx]]
            text_res = text2data[ids2text[task_idx]]
            curr_data.extend(text_res)
            value_counts_strict_data.append(curr_data)
            
#     value_counts_adjusted = [''.join(sorted([str(el) for el in vc if el != 0])) for vc in value_counts]
#     print("MOST COMMON VOTE COMBINATIONS", Counter(value_counts_adjusted).most_common(100))
    
    alpha_no_strict = krippendorff.alpha(value_counts=np.array(value_counts),level_of_measurement='nominal')
    alpha_strict = krippendorff.alpha(value_counts=np.array(value_counts_strict),level_of_measurement='nominal')
    return alpha_no_strict, alpha_strict, value_counts_strict, value_counts_strict_data

def process_toloka_results_batch(file, agg_confidence_threshold,constraints_type, debug = False):
    coresponding_raw_data_file_path = file[:-4]+'r.tsv'
    
    df_agg, agg_unique_texts = get_unique_texts_from_df(file)
    
    df_raw, raw_unique_texts = get_unique_texts_from_df(coresponding_raw_data_file_path)
    
    if len(raw_unique_texts) != len(agg_unique_texts & raw_unique_texts):
        if debug == True:
            print(len(raw_unique_texts), len(agg_unique_texts), len(agg_unique_texts & raw_unique_texts))
       
    df_agg['CONFIDENCE:result'] = df_agg['CONFIDENCE:result'].apply(lambda x: float(x[:-1])/100)
    df_agg_high_quality = df_agg[df_agg['CONFIDENCE:result']>agg_confidence_threshold]
    df_raw_high_quality = df_raw[df_raw['INPUT:text'].isin(df_agg_high_quality['INPUT:text'])]
    
    nltk_alpha,collected_data_alpha, ids2text_from_raw = calculate_krippendorf_from_raw_df_nltk(df_raw_high_quality)
    if debug == True:
        print("nltk_alpha", nltk_alpha)
        print("collected_data_alpha",collected_data_alpha[:10])
        print('\n\n')
        
    assert abs(len(df_agg_high_quality) - len(ids2text_from_raw)) < 15, "{}, {}".format(len(df_agg_high_quality), len(ids2text))
    alpha_no_strict, alpha_strict, value_counts_strict, value_counts_strict_data = krippendorf_with_filtering(collected_data_alpha, ids2text_from_raw,
                                                                                                              constraints_type, df_agg_high_quality)
    
    assert round(alpha_no_strict,2) == round(nltk_alpha,2), print(alpha_no_strict, nltk_alpha)
    
#     df_raw_report = pd.DataFrame(columns = ['raw_init','raw_init_conf'], 
#                                  data = [[len(df_raw), len(df_raw_high_quality)]])
#     if debug == True: print(df_raw_report) 
    filename = file.split('/')[-1]
    df_agg_report = pd.DataFrame(index = [filename],columns = ['aggreg_samples','aggreg_samples_high_conf',"alpha_high_conf","aggreg_samples_high_conf_and_agreemnt","alpha_high_conf_agreement"], 
                                 data = [[len(df_agg), len(df_agg_high_quality),alpha_no_strict,len(value_counts_strict),alpha_strict]])
    
        
    return df_agg_report, value_counts_strict, value_counts_strict_data
    
# process_toloka_results_batch("./clean_collection/step1/s1g1.tsv",0.95,"top_confidence_only")
# process_toloka_results_batch("./clean_collection/step1/s1g1.tsv",0.95,"top_confidence_vs_oneAND")
df_ag_r, vc, vcd = process_toloka_results_batch("./clean_collection/step1/s1g1.tsv",0.95,"top_confidence_vs_twoAND", debug = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_idx'] = df['INPUT:text'].map(text2ids)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['worker_idx'] = df['ASSIGNMENT:worker_id'].map(worker2id)


nltk_alpha 0.46175847035987416
collected_data_alpha [(0, 264, 'yes'), (0, 609, 'no'), (0, 24, 'yes'), (0, 525, 'yes'), (0, 157, 'no'), (1, 609, 'yes'), (1, 157, 'no'), (1, 264, 'no'), (1, 24, 'yes'), (1, 525, 'yes')]





In [7]:
df_agg_test.head()

Unnamed: 0,INPUT:text,OUTPUT:result,CONFIDENCE:result,GOLDEN:result,Unnamed: 4
0,вполне вероятно- поврежден матрикс..нужен врач...,no,99.99%,,
1,Работа проституткой это стыдно? интересует сов...,no,95.89%,,
2,"пожарные, выдвижные-сборные для крыш в основном",no,99.99%,,
3,Помогите решить задачу по нахождению уровня бе...,no,99.99%,,
4,Больше автоматизации уволеных рабочих людей не...,yes,95.27%,,


In [11]:
idx_t = df_agg_test['INPUT:text'].tolist().index('Христиане, в почему вы считаете, что только Библия истина ?')
df_agg_test.iloc[idx_t]

INPUT:text           Христиане, в почему вы считаете, что только Би...
OUTPUT:result                                                       no
CONFIDENCE:result                                               99.99%
GOLDEN:result                                                      NaN
Unnamed: 4                                                         NaN
Name: 566, dtype: object

In [12]:
# ПРОВЕРЕНО. НЕ РАСХОДИТСЯ

In [13]:
# vcd[-19:]

In [14]:
df_ag_r

Unnamed: 0,aggreg_samples,aggreg_samples_high_conf,alpha_high_conf,aggreg_samples_high_conf_and_agreemnt,alpha_high_conf_agreement
s1g1.tsv,1017,789,0.461758,699,0.571715


In [9]:
import re
# re.match('s\dg\d.tsv','s1g1.tsv')
re.match('s\dg\d.tsv','s1g1t.tsv')

In [11]:
%%capture --no-display

def analysis(confidence_threshold, agreement_filtering_type):
    folder_with_folders = './clean_collection/'
    df_reports_list = []
    value_counts_list = []
    value_counts_strict_data_list = []
    for folder in os.listdir(folder_with_folders):
        final_folder = os.path.join(folder_with_folders, folder)
        if os.path.isdir(final_folder):
            for file in os.listdir(final_folder):
                if re.match('s\dg\d.tsv',file) or re.match('s\d.tsv',file):
                    filepath  = os.path.join(final_folder,file)
                    df_report, value_counts_strict, value_counts_strict_data = process_toloka_results_batch(filepath,confidence_threshold, agreement_filtering_type)
                    df_reports_list.append(df_report)
                    value_counts_list.extend(value_counts_strict)
                    value_counts_strict_data_list.extend(value_counts_strict_data)
                    
    df_reports_overall = pd.concat(df_reports_list)
#     alpha_strict_total = krippendorff.alpha(value_counts=np.array(value_counts_list),level_of_measurement='nominal')   
    return df_reports_overall, value_counts_list, value_counts_strict_data_list
# dfr, al, vcd  = analysis(0.95,"top_confidence_vs_twoAND")

In [17]:
dfr

Unnamed: 0,aggreg_samples,aggreg_samples_high_conf,alpha_high_conf,aggreg_samples_high_conf_and_agreemnt,alpha_high_conf_agreement
step1\s1g1.tsv,1017,789,0.461758,699,0.571715
step1\s1g2.tsv,2124,1568,0.563686,1537,0.587646
step1\s1g3.tsv,1759,1494,0.459029,1427,0.508008
step2\s2g1.tsv,6426,5156,0.562238,4932,0.61086
step2\s2g2.tsv,12154,9065,0.619214,8834,0.659446
step2\s2g3.tsv,12053,10081,0.497961,9605,0.546595
step3\s3g1.tsv,18912,14476,0.696933,14235,0.716439
step3\s3g2.tsv,10665,8523,0.675676,8473,0.684468
step3\s3g3.tsv,23975,17748,0.715539,17679,0.721802
step4\s4g1.tsv,26434,15953,0.715383,15876,0.720166


In [18]:
vcd[:10]

[['А есть AL патч на Rst Ev 3 ?', 'no', 0.9998999999999999],
 ['На улице всякое может случиться, а к девушкам в ванную вламываться просто нехорошо.',
  'no',
  0.9995999999999999],
 ['Подставляешь вместо f0 значение функции в точке 0.если fx=sx,то ss0=s1=0,а если gx=sx,то ss0=s0=0Вот, собственно, и все.',
  'no',
  0.9990000000000001],
 ['Этот даун опять упал на ровном месте. Как он дожил вообще до своего возраста?',
  'yes',
  0.9936],
 ['так работа для быдла жемимо делаю бизнес', 'yes', 0.992],
 ['например тут  по рекламе ими как раз пользуются КП и АИФ',
  'no',
  0.9995999999999999],
 ['только за услугой поиска работы, безработным вас не признают. вы занятый.',
  'no',
  0.9948],
 ['А мог бы в Дрейка сейчас играть…', 'no', 0.9998999999999999],
 ['Лучшие производители постельного белья. Какая марка, бренд постельного белья вам нравится больше всего?',
  'no',
  0.9673999999999999],
 ['Чё вспомнил Зорро и его немого слугу', 'no', 0.9992]]

In [14]:
%%capture --no-stdout
collected_data = []
for conf in [0, 0.85,0.9,0.95]:
    for aggr_type in ["top_confidence_only","top_confidence_vs_oneAND","top_confidence_vs_twoAND"]:
        print(conf, aggr_type)
        df_curr_report , value_counts_strict , value_counts_strict_data = analysis(conf,aggr_type)
        alpha_strict = krippendorff.alpha(value_counts=np.array(value_counts_strict),level_of_measurement='nominal')
        number_of_kept_samples_conf = df_curr_report['aggreg_samples_high_conf'].sum()
        number_of_kept_samples_conf_vs_agrement = df_curr_report['aggreg_samples_high_conf_and_agreemnt'].sum()
        
        collected_data.append([conf,number_of_kept_samples_conf, 
                               aggr_type,number_of_kept_samples_conf_vs_agrement, alpha_strict])
df_report = pd.DataFrame(data = collected_data, columns=['confidence','samples_kep_conf_only',
                                                         'strictType','samples_kep_conf_and_aggr','alpha_strict'])
df_report.head(10)

0 top_confidence_only


KeyboardInterrupt: 

In [24]:
df_report.head(10)
# С РАЗНЫМИ ВАРИАЦЯМИ ДАТАСЕТОВ ИМЕЮЩИХ НОРМАЛЬНУЮ СОГЛАСОВАННОТЬ ПОГОНЯЬ МОДЕЛЬКИ НА ОДИНКОВЫХ МЕТАПРААМТЕРАХ

Unnamed: 0,confidence,samples_kep_conf_only,strictType,samples_kep_conf_and_aggr,alpha_strict
0,0.85,140798,top_confidence_only,99197,1.0
1,0.85,140798,top_confidence_vs_oneAND,132904,0.604988
2,0.85,140798,top_confidence_vs_twoAND,138160,0.555374
3,0.9,131411,top_confidence_only,97310,1.0
4,0.9,131411,top_confidence_vs_oneAND,124844,0.652478
5,0.9,131411,top_confidence_vs_twoAND,129178,0.604831
6,0.95,113804,top_confidence_only,90383,1.0
7,0.95,113804,top_confidence_vs_oneAND,108991,0.728971
8,0.95,113804,top_confidence_vs_twoAND,112093,0.684328


In [15]:
%%capture --no-stdout
df_rpt, value_counts_stct, data_from_strict_sort  = analysis(0.9,"top_confidence_vs_oneAND")

In [16]:
df_export = pd.DataFrame(data = data_from_strict_sort, columns = ['INPUT:text','OUTPUT:result','CONFIDENCE:result'])

In [17]:
df_export.to_csv("Inappropriate_09_top_vs_one.csv", index = None)

In [None]:
# df_export.to_csv("Inappropriate_095_top_vs_one.csv", index = None)

In [None]:
len(data_from_strict_sort)

In [None]:
krippendorff.alpha(value_counts=np.array(value_counts_stct),level_of_measurement='nominal')   

In [None]:
df_rpt['alpha_high_conf_agreement'].median()

In [None]:
df_rpt['aggreg_samples_high_conf'].sum()

In [None]:
dfr