In [1]:
import math
import numpy as np
import random
from scipy.stats import beta
import collections
import pandas as pd
from tqdm import tqdm

In [2]:
class Workers:

    def __init__(self, workers_num, cheaters_prop, fixed_acc = False, workers_acc = .5):
        self.workers_num = workers_num
        self.cheaters_prop = cheaters_prop
        self.acc_passed = []
        self.fixed_acc = fixed_acc
        self.workers_acc = workers_acc

    # simulate workers
    def simulate_workers(workers_num, cheaters_prop, fixed_acc, workers_acc):
        for _ in range(workers_num):
            if (fixed_acc == False):
                if np.random.binomial(1, cheaters_prop):
                    # worker_type is 'rand_ch'
                    worker_acc_pos = 0.5
                else:
                    # worker_type is 'worker'
                    worker_acc_pos = 0.8 + (np.random.beta(1, 1) * 0.2)
            else:
                worker_acc_pos = workers_acc
            
            self.acc_passed.append(worker_acc_pos)
        #end for
            
        return self.acc_passed

In [3]:
#classification functions
def classification_fn_posterior(votes, prior, accuracy):    
    n = len(votes)
    y = sum(votes.values())

    likelihood = binomial_likelihood(prior, n, y)

    #bayes theorem
    posterior = (likelihood * prior) / ((likelihood * prior) + (1 - accuracy) * (1 - prior))

    return posterior

def classification_fn_beta_pdf(votes, th, accuracy):    
    n = len(votes)
    y = sum(votes.values())

    posterior = beta.sf(th, 1 + y, 1 + (n - y))

    return posterior

def classification_fn_mv(votes):
    n = len(votes)
    s = sum(votes.values())

    return sum(votes.values()) / len(votes)

In [4]:
class Generator:

    def __init__(self, params):
        self.workers_accuracy = params['workers_accuracy']
        self.workers_num = params['workers_num']      
        self.items_num = params['items_num']      
        self.cost_ratio = params.get('cost_ratio')
        self.classification_threshold = params.get('classification_threshold')
        self.index_workers_voted_on_item = {}
        self.votes_per_item = params['votes_per_item']
        self.classification_fn = params['classification_fn']
    
    def generate_gold_data(self, items_num, true_percentage):
        gold_data = []
        for item_index in range(items_num):
            if np.random.binomial(1, true_percentage):
                val = 1
            else:
                val = 0
            gold_data.append(val)
        #end for
        return gold_data
    
    def get_random_worker_accuracy(self, item, items_num):       
        '''
        #TO-DO: add logic to avoid worker vote on same task
        worker_found = False
        
        while (worker_found == False):
            index = np.random.randn(0, self.workers_num - 1)

            if (index not in self.index_workers_voted_on_item[item]):
                self.index_workers_voted_on_item[item].append(index)
                worker_found = True
        ''' 
        worker_id = random.randint(0, self.workers_num - 1)
        return (worker_id, self.workers_accuracy[worker_id])
    
    def get_worker_vote(self, i, items_num):
        worker_id, worker_acc = self.get_random_worker_accuracy(i, items_num)
        
        if np.random.binomial(1, worker_acc):
            vote = 1
        else:
            vote = 0
            
        return (worker_id, vote)
    
    def get_items_predicted_classified(results):
        return {i:v for (i,v) in results.items() if v == True}
    
    def generate_votes_gt(self, items_num, ct):
        total_votes = collections.defaultdict(dict)
        
        #base votes
        for i in range(items_num):
            for k in range(self.votes_per_item):
                worker_id, vote = self.get_worker_vote(i, items_num)

                total_votes[i][worker_id] = vote

        #evaluate votes
        results = decision_fn(items_num, total_votes, ct, self.cost_ratio, 
                                                       self.classification_fn)
        #Check if must continue collecting votes
        items_predicted_classified = Generator.get_items_predicted_classified(results)
        must_get_more_votes = len(items_predicted_classified) > 0
        
      
        while(must_get_more_votes):
            for i, v in items_predicted_classified.items():
                worker_id, vote = self.get_worker_vote(i, items_num)

                total_votes[i][worker_id] = vote                
            #end for
            results = decision_fn(items_num, total_votes, ct, self.cost_ratio, 
                                                       self.classification_fn)
            
            #Stop when there are no more items that can be classified
            items_predicted_classified = Generator.get_items_predicted_classified(results)
            must_get_more_votes = len(items_predicted_classified) > 0
        #end while
            
        return total_votes

In [5]:

'''
Function to answer: must continue collecting votes over each task?

Input:
items_num - amount if items
votes - dictionary of dictionaries, containing the votes over each item where keys corresponds to workers ID
    {
        item_i: {worker_i:vote...worker_n:vote},
        ...
        item_n: {worker_i:vote...worker_n:vote},
    }
classification_threshold - value between 0 and 1 for deciding if prob of data is enough or must continue
cost_ratio - ratio of crowd to expert cost, value between 0 and 1
classification_function - function to calculate how likely is to be classified

Output:
    Dictionary with the decision indexed by item_id
        {
            item_id: bool
            ...
            item_n: ...
        }
    Where False = Stop and True=Continue collecting votes
'''
def decision_fn(items_num, votes, classification_threshold, cost_ratio, classification_function):      
    expert_cost = 1 / cost_ratio  
    results = dict.fromkeys(range(items_num), False)

    for item_id in range(items_num):            
        item_votes = votes[item_id].copy()
        actual_cost = len(item_votes) #actual cost per item i

        #First case
        #prob with actual votes
        classification_prob = classification_function(item_votes)
        if classification_prob <= classification_threshold:
            item_simulated_result = simulate_drawing(item_votes, classification_threshold, classification_function, actual_cost, expert_cost)

            if(item_simulated_result['cost_mean'] <= expert_cost): #must take cost mean and std into consideration?
                results[item_id] = True
    #end for              

    return results
#end decision_fn

'''
    Input:
        votes - actual votes on item 
        classification_threshold - value between 0 and 1 for deciding if prob of data is enough or must continue
        classification_function - function to calculate how likely is to be classified
        actual_cost - cost for the actual votes
        expert_cost - cost for expert vote
    Output:
        {
            'cost_mean': mean of the cost over N iterations,
            'cost_std': std of the cost over N iterations
        }
'''
def simulate_drawing(votes, classification_threshold, classification_function, actual_cost, expert_cost):
    must_continue = True
    simulated_costs = []
    for _ in range(drawing_simulations_amount):
        #while: item not classified or not too expensive             
        while (must_continue == True):
            #prob with actual votes
            classification_prob = classification_function(votes) #mv

            if classification_prob > classification_threshold:
                must_continue = False            
                simulated_costs.append(actual_cost)
            else:
                #draw vote
                vote = np.random.binomial(1, classification_prob)
                new_index = max(votes.keys()) + 1
                votes[new_index] = vote
                actual_cost += 1 #increment actual cost with each simulated vote 
                if(actual_cost >= (expert_cost * expert_cost_increment)):
                    #Set false if the item is too expensive
                    must_continue = False
                    simulated_costs.append(actual_cost)
        #end while
    #end for

    return {
            'cost_mean': np.mean(simulated_costs),
            'cost_std': np.std(simulated_costs)
           }
#end simulate_drawing

In [6]:
#Assumptions
#1 condition
#difficulty of tasks are all equal
#there are no test questions
#there are no cheaters

z = 0 #% cheaters?
items_num = 100
cts = np.arange(.7, .96, 0.05) #classification thresholds
cr = .01 #ratio 1:100
votes_per_item = 3
iter_num = 50 #fixed
workers_num = 1000
fixed_acc = True
workers_acc = .9
true_percentage = 1
expert_cost_increment = 2
drawing_simulations_amount = 50

main_results = []

workers_accuracy = Workers(workers_num, z, fixed_acc, workers_acc).simulate_workers()

params = {
    'workers_accuracy': workers_accuracy,
    'workers_num': workers_num,
    'items_num': items_num,
    'cost_ratio': cr,
    'votes_per_item': votes_per_item,
    'classification_fn': classification_fn_mv
}

for ct in tqdm(cts):
    ct = round(ct, 2) #limit to two decimals
    cost = []
    items_classified_in = []
    items_classified_out = []
    
    for _ in range(iter_num):
        #ground_truth = Generator(params).generate_gold_data(items_num, true_percentage)

        th_total_votes = Generator(params).generate_votes_gt(items_num, ct)

        cost.append(sum([len(v) for (x,v) in th_total_votes.items()]) * cr)
    #end for iterations
    
    main_results.append([ct, round(np.mean(cost), 3), round(np.std(cost), 3), round(np.mean(cost) / items_num, 3)])
#end for thresholds

print("Iterations per ct: {}".format(iter_num))
print("Items: {}".format(items_num))
pd.DataFrame(main_results, columns=["Threshold","Cost Avg","Cost Std", "Cost per item"])

100%|██████████| 6/6 [02:25<00:00, 34.01s/it]

Iterations per ct: 50
Items: 100





Unnamed: 0,Threshold,Cost Avg,Cost Std,Cost per item
0,0.7,3.371,0.138,0.034
1,0.75,3.742,0.219,0.037
2,0.8,4.173,0.332,0.042
3,0.85,4.497,0.511,0.045
4,0.9,5.78,1.015,0.058
5,0.95,5.37,1.232,0.054


In [5]:
from scipy.special import binom
[1,1,1,1,0]
binom(2,.5)

1.6976527263135501