In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
from typing import Any, Callable, Iterable, Optional, Sequence, Union

In [14]:
from nltk.metrics import masi_distance
from collections import Counter
from nltk.metrics import agreement
from nltk.metrics.agreement import AnnotationTask

In [4]:
import re

def depersonalize(text):
    text = str(text)
    url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    text= re.sub(url_regex, "url", text)
    text = re.sub("id[\d]*",'',text)
    text = re.sub("> ?",'',text)
    text = re.sub("@[\w]*",'',text)
#     text = re.sub("\+[\d]*", "", text)
    text = re.sub("[\d]+", "NUMBER", text)
    text = re.sub(">>", "", text)
#     text = re.sub("[\d]{3,100}", "", text)
    text = re.sub("[\t|\n|\r]", "", text)
    text = re.sub("[(OP)|url|nickname|phone_number]", "", text)
    return text

In [5]:
df_sensitive = pd.read_csv("sensitive_topics_vs_source.csv")

In [11]:
df_sensitive_toloka = df_sensitive[df_sensitive['source'] == 'toloka']
toloka_trtes_set = set(df_sensitive[df_sensitive['source']=='toloka_trtest']['text'])

In [7]:
len(df_sensitive_toloka)

9278

In [8]:
df_sensitive_toloka_texts_set = set(df_sensitive_toloka['text'])
len(df_sensitive_toloka_texts_set)

9278

In [17]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [16]:
def calc_agreement_nltk(df_group):
    worker2id = {}
    for pair in df_group['ASSIGNMENT:worker_id'].tolist():
        if pair not in worker2id:
            worker2id[pair] = len(worker2id)
    df_group['worker_idx'] = df_group['ASSIGNMENT:worker_id'].map(worker2id)
    
    text2ids = {}
    for t in set(df_group['INPUT:text'].tolist()):
        if t not in text2ids:
            text2ids[t] = len(text2ids)
    df_group['text_idx'] = df_group['INPUT:text'].map(text2ids)
    ids2text = {idx:text for text,idx in text2ids.items()}
    
    reply_columns = [c for c in df_group.columns if 'OUTPUT' in c]
    
    collected_data = []
    item_idx_dict = {}
    
    for i,el in df_group.iterrows():
        coder_idx = el['worker_idx']
        
        item_idx = el['text_idx']
        
        if item_idx in item_idx_dict:
            item_idx_dict[item_idx] += 1
        else:
            item_idx_dict[item_idx] = 1
            
        labels_list = []
        for repl in reply_columns:
            if el[repl] == True:
                labels_list.append(repl[7:])

        if len(labels_list) == 0: labels_list = ['none']
        collected_triplet = (coder_idx,item_idx,frozenset(labels_list))
        collected_data.append(collected_triplet)
        
    task = AnnotationTask(distance = masi_distance)
    
    task.load_array(collected_data)
    
    return task.alpha() #,

def get_high_conf_texts(df_hc, thrsh):
    confidnece_columns = []
    for c in df_hc.columns:
        if 'CONFIDENCE' in c:
            df_hc[c] = df_hc[c].apply(lambda x: float(x[:-1])/100)
            confidnece_columns.append(c)
            
    df_hc['meet_req'] = df_hc.apply(lambda x: all([x[c] > thrsh for c in confidnece_columns]), axis = 1)
    
    return set(df_hc[df_hc['meet_req'] == True]['INPUT:text'])
    
def get_aggr_batch_with_filtering(batch_name, drop_unconfident_thrsh = None, 
                                  drop_traintest_examples = False, print_output = False, drop_trtest = True):
    if batch_name == 'first': 
        raw_files = ['g1r.tsv','g2r.tsv','g3r.tsv']
    elif batch_name == 'fourth':
        raw_files = ['g1_4r.tsv','g2_4r.tsv','g3_4r.tsv']
    elif batch_name == 'old':
        raw_files = ['maybe_old/g1r.tsv','maybe_old/g2r.tsv']
    
    engaged_texts= set()
   
    collected_agreements = []

    for i, file in enumerate(raw_files):
        if print_output == True: print("="*100)
        batch_path = "./directly_from_toloka/"+file
        df_curr_common= pd.read_csv(batch_path, sep = '\t')
        
        if drop_trtest == True:
            if print_output == True: print("before dropping trtest ...", len(df_curr_common))
            df_curr_common = df_curr_common[~df_curr_common['INPUT:text'].isin(toloka_trtes_set)]#!!
            if print_output == True: print("after dropping trtest ...", len(df_curr_common))
            
        if drop_unconfident_thrsh:
            file_agg = re.sub('r','',file)
            batch_path_agg = "./directly_from_toloka/"+file_agg
            df_curr_agg = pd.read_csv(batch_path_agg, sep = '\t')
        
            current_high_conf_texts = get_high_conf_texts(df_curr_agg, drop_unconfident_thrsh)
            
            if print_output == True: print("before dropping low conf ...", len(df_curr_common))
            df_curr_common = df_curr_common[df_curr_common['INPUT:text'].isin(current_high_conf_texts)]
            if print_output == True: print("after dropping low conf ...", len(df_curr_common))
                
        engaged_texts = engaged_texts.union(set(df_curr_common['INPUT:text']))
        
        columns_to_merge = [c for c in df_curr_common.columns if c == 'INPUT:text' or 'OUTPUT' in c  or c =='ASSIGNMENT:worker_id']
        df_curr_common = df_curr_common[columns_to_merge]
        
        alpha_curr = calc_agreement_nltk(df_curr_common)
            
        collected_agreements.append(alpha_curr)
            
    return collected_agreements, engaged_texts

aggr, eng_txts = get_aggr_batch_with_filtering('first',drop_unconfident_thrsh = 0.8, print_output = True)

before dropping trtest ... 12705
after dropping trtest ... 10006
before dropping low conf ... 10006
after dropping low conf ... 7668
before dropping trtest ... 15009
after dropping trtest ... 12250
before dropping low conf ... 12250
after dropping low conf ... 8540
before dropping trtest ... 16934
after dropping trtest ... 13810
before dropping low conf ... 13810
after dropping low conf ... 8641


In [83]:
aggr, len(eng_txts)

([0.39641539683044824, 0.6211801748074177, 0.39857040655455145], 2289)

In [84]:
def get_filter_stats_from_all_batches(drop_unconfident_thrsh, batches = ['first','fourth','old'],drop_trtest=True):
    
    collected_agreements_no_pruned = []
    
    texts_set = set ()
    for batch in batches:#
        aggr_lst_curr, eng_txts_curr = get_aggr_batch_with_filtering(batch, 
                                                                     drop_unconfident_thrsh = drop_unconfident_thrsh,
                                                                    drop_trtest = drop_trtest)
        collected_agreements_no_pruned.extend(aggr_lst_curr)
        
        texts_set = texts_set.union(eng_txts_curr)
    
    return np.mean(aggr_lst_curr), texts_set
    
agrmnt, alltxtx = get_filter_stats_from_all_batches(0.8)

before dropping trtest ... 12705
after dropping trtest ... 10006
before dropping low conf ... 10006
after dropping low conf ... 7668
before dropping trtest ... 15009
after dropping trtest ... 12250
before dropping low conf ... 12250
after dropping low conf ... 8540
before dropping trtest ... 16934
after dropping trtest ... 13810
before dropping low conf ... 13810
after dropping low conf ... 8641
before dropping trtest ... 33186
after dropping trtest ... 27715
before dropping low conf ... 27715
after dropping low conf ... 19448
before dropping trtest ... 39363
after dropping trtest ... 33894
before dropping low conf ... 33894
after dropping low conf ... 25467
before dropping trtest ... 27241
after dropping trtest ... 23246
before dropping low conf ... 23246
after dropping low conf ... 16550
before dropping trtest ... 8666
after dropping trtest ... 8042
before dropping low conf ... 8042
after dropping low conf ... 3969
before dropping trtest ... 3898
after dropping trtest ... 3678
before

In [85]:
agrmnt, len(alltxtx)

(0.4744957067281021, 8694)

In [86]:
def check_different_filtering_options(batches = ['first','fourth','old'], drop_trtest = True):
    data = []
    for ds_thrsh_curr in [0.95,0.98]:
        agg, engaged_txt = get_filter_stats_from_all_batches(batches = batches, drop_unconfident_thrsh = ds_thrsh_curr, drop_trtest = drop_trtest)
        
        data.append([ds_thrsh_curr, agg, len(engaged_txt)])
    df_rep = pd.DataFrame(data = data, columns = ['threshold','agreeement_via_threshold','all_engaged_texts'])
    return df_rep

In [87]:
df_no_trtes_rep = check_different_filtering_options(drop_trtest=True)
df_no_trtes_rep.head()

before dropping trtest ... 12705
after dropping trtest ... 10006
before dropping low conf ... 10006
after dropping low conf ... 7649
before dropping trtest ... 15009
after dropping trtest ... 12250
before dropping low conf ... 12250
after dropping low conf ... 7797
before dropping trtest ... 16934
after dropping trtest ... 13810
before dropping low conf ... 13810
after dropping low conf ... 6870
before dropping trtest ... 33186
after dropping trtest ... 27715
before dropping low conf ... 27715
after dropping low conf ... 15187
before dropping trtest ... 39363
after dropping trtest ... 33894
before dropping low conf ... 33894
after dropping low conf ... 23329
before dropping trtest ... 27241
after dropping trtest ... 23246
before dropping low conf ... 23246
after dropping low conf ... 14049
before dropping trtest ... 8666
after dropping trtest ... 8042
before dropping low conf ... 8042
after dropping low conf ... 1972
before dropping trtest ... 3898
after dropping trtest ... 3678
before

Unnamed: 0,threshold,agreeement_via_threshold,all_engaged_texts
0,0.95,0.608286,7610
1,0.98,0.669712,7066


In [80]:
df = check_different_filtering_options(batches = 'first','fourth')
df.head()

Unnamed: 0,threshold,agreeement_via_threshold,all_engaged_texts
0,0.95,0.606345,7804
1,0.98,0.667156,7247
