In [1]:
import numpy as np
from truthdiscovery import (
    AverageLog,
    ConvergenceIterator,
    FixedIterator,
    Investment,
    MajorityVoting,
    PooledInvestment,
    Sums,
    TruthFinder,
    SyntheticData,
    Dataset,
    FileDataset,
    SupervisedData
)



### Synthetic data format:

In [3]:
# Build one synthetic dataset (trust vector of 4 uniformly random values,
# num_variables=100, claim_probability=0.75, domain_size=20) and serialise
# it straight to CSV text.
synth = SyntheticData(
    trust=np.random.uniform(size=(4,)),
    num_variables=100,
    claim_probability=0.75,
    domain_size=20,
).to_csv()

# One list entry per CSV row; printed as-is to show the raw format.
data = synth.split('\n')
print(data)

['8.0,9.0,6.0,1.0,18.0,18.0,9.0,1.0,19.0,18.0,12.0,19.0,14.0,9.0,5.0,5.0,9.0,16.0,14.0,5.0,7.0,7.0,18.0,5.0,18.0,6.0,2.0,15.0,6.0,16.0,2.0,2.0,16.0,14.0,19.0,9.0,1.0,5.0,17.0,0.0,17.0,17.0,16.0,7.0,7.0,10.0,12.0,17.0,8.0,9.0,0.0,6.0,6.0,16.0,8.0,16.0,18.0,1.0,2.0,2.0,15.0,8.0,5.0,12.0,19.0,10.0,18.0,10.0,14.0,14.0,8.0,8.0,18.0,13.0,3.0,7.0,18.0,4.0,0.0,12.0,3.0,19.0,17.0,9.0,1.0,2.0,8.0,16.0,13.0,15.0,13.0,14.0,3.0,1.0,10.0,0.0,5.0,10.0,18.0,16.0', '8.0,9.0,6.0,1.0,18.0,5.0,9.0,9.0,5.0,,10.0,19.0,14.0,10.0,,,14.0,16.0,12.0,5.0,7.0,15.0,1.0,14.0,18.0,17.0,,15.0,17.0,16.0,14.0,2.0,16.0,10.0,,,1.0,5.0,17.0,17.0,17.0,14.0,16.0,,3.0,,,17.0,10.0,1.0,0.0,,6.0,16.0,8.0,15.0,14.0,1.0,5.0,,15.0,3.0,,5.0,19.0,3.0,,4.0,,17.0,,1.0,8.0,13.0,,2.0,18.0,4.0,7.0,12.0,3.0,10.0,18.0,19.0,1.0,17.0,8.0,0.0,0.0,12.0,13.0,9.0,18.0,2.0,2.0,12.0,5.0,5.0,,16.0', ',9.0,6.0,1.0,18.0,18.0,9.0,,19.0,18.0,12.0,,14.0,9.0,5.0,,9.0,16.0,14.0,5.0,7.0,7.0,18.0,5.0,18.0,18.0,2.0,,6.0,16.0,2.0,2.0,16.0,,,9.0,1.0,5.0,,0.0,17

### Processed synthetic data format:

In [4]:
# Show how the CSV text is turned into the inputs used by the experiments
# below. One dataset is enough to illustrate the format, so the data is
# generated and parsed exactly once.

# random cell type data synthesis
synth_csv = SyntheticData(
    trust=np.random.uniform(size=(4,)),
    num_variables=100,
    claim_probability=0.25,
    domain_size=20
).to_csv()
data = synth_csv.split('\n')

# data processing
# Row 0 is used as the true value of every cell; values are kept as the
# strings found in the CSV.
ground_truths = {
    f'cell {col + 1}': cell_type_t
    for col, cell_type_t in enumerate(data[0].split(','))
}

# Every later row is one source. A blank field means the source made no
# claim for that cell, so it is skipped instead of being recorded as a
# claim of the value ''.
claims = [
    (f'source {row}', f'cell {col + 1}', cell_type)
    for row, line in enumerate(data[1:], start=1)
    for col, cell_type in enumerate(line.split(','))
    if cell_type
]

print('Ground truths:')
print(ground_truths)
print()
print("Claims:")
print(claims)

Ground truths:
{'cell 1': '2.0', 'cell 2': '5.0', 'cell 3': '13.0', 'cell 4': '15.0', 'cell 5': '8.0', 'cell 6': '2.0', 'cell 7': '4.0', 'cell 8': '5.0', 'cell 9': '12.0', 'cell 10': '14.0', 'cell 11': '9.0', 'cell 12': '5.0', 'cell 13': '15.0', 'cell 14': '18.0', 'cell 15': '14.0', 'cell 16': '19.0', 'cell 17': '5.0', 'cell 18': '17.0', 'cell 19': '18.0', 'cell 20': '1.0', 'cell 21': '5.0', 'cell 22': '18.0', 'cell 23': '17.0', 'cell 24': '15.0', 'cell 25': '10.0', 'cell 26': '16.0', 'cell 27': '0.0', 'cell 28': '1.0', 'cell 29': '15.0', 'cell 30': '16.0', 'cell 31': '1.0', 'cell 32': '17.0', 'cell 33': '7.0', 'cell 34': '17.0', 'cell 35': '13.0', 'cell 36': '10.0', 'cell 37': '18.0', 'cell 38': '6.0', 'cell 39': '19.0', 'cell 40': '8.0', 'cell 41': '0.0', 'cell 42': '8.0', 'cell 43': '9.0', 'cell 44': '17.0', 'cell 45': '1.0', 'cell 46': '17.0', 'cell 47': '8.0', 'cell 48': '18.0', 'cell 49': '16.0', 'cell 50': '0.0', 'cell 51': '8.0', 'cell 52': '5.0', 'cell 53': '7.0', 'cell 54': '

### Average algorithm accuracy testing (each result is an average of 100 runs)
(set the `algorithm` variable to any of the imported algorithms)

In [None]:
# NUM_RESULTS averaged accuracies are printed; each one is the mean over
# RUNS_PER_RESULT independently generated datasets (100, as stated in the
# section heading).
NUM_RESULTS = 100
RUNS_PER_RESULT = 100

for _result in range(NUM_RESULTS):
    accuracy_scores = []
    # range() gives exactly RUNS_PER_RESULT runs per average.
    for _run in range(RUNS_PER_RESULT):
        # random cell type data synthesis
        synth_csv = SyntheticData(
            trust=np.random.uniform(size=(4,)),
            num_variables=100,
            claim_probability=1.0,
            domain_size=20
        ).to_csv()
        data = synth_csv.split('\n')

        # data processing
        # Row 0 is used as the true value of every cell; values stay as the
        # strings found in the CSV.
        ground_truths = {
            f'cell {col + 1}': cell_type_t
            for col, cell_type_t in enumerate(data[0].split(','))
        }
        # Every later row is one source. Blank fields (no claim made, or a
        # blank trailing line if the CSV has one) are skipped rather than
        # recorded as claims of the value ''.
        claims = [
            (f'source {row}', f'cell {col + 1}', cell_type)
            for row, line in enumerate(data[1:], start=1)
            for col, cell_type in enumerate(line.split(','))
            if cell_type
        ]

        # Swap in any of the imported algorithms here. A fresh instance is
        # built for every run, as before.
        algorithm = TruthFinder(initial_trust=0.1, influence_param=0.0)
        mydata = Dataset(claims)
        supervised = SupervisedData(mydata, ground_truths)
        results = algorithm.run(supervised.data)
        accuracy_scores.append(supervised.get_accuracy(results))

    average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
    print(average_accuracy)

### Per-run accuracy testing: accuracy of each of 1000 individual runs (no averaging)
(set the `algorithm` variable to any of the imported algorithms)

In [None]:
 # Print the accuracy of every individual run (no averaging), one line per
 # freshly generated synthetic dataset.
 for _ in range(1000):
     # random cell type data synthesis
     synth_csv = SyntheticData(
         trust=np.random.uniform(size=(4,)),
         num_variables=100,
         claim_probability=1.0,
         domain_size=20
     ).to_csv()
     data = synth_csv.split('\n')

     # data processing
     # Row 0 is used as the true value of every cell; values stay as the
     # strings found in the CSV.
     ground_truths = {
         f'cell {col + 1}': cell_type_t
         for col, cell_type_t in enumerate(data[0].split(','))
     }
     # Every later row is one source. Blank fields (no claim made, or a
     # blank trailing line if the CSV has one) are skipped rather than
     # recorded as claims of the value ''.
     claims = [
         (f'source {row}', f'cell {col + 1}', cell_type)
         for row, line in enumerate(data[1:], start=1)
         for col, cell_type in enumerate(line.split(','))
         if cell_type
     ]

     # Swap in any of the imported algorithms here.
     algorithm = Investment()
     mydata = Dataset(claims)
     supervised = SupervisedData(mydata, ground_truths)
     results = algorithm.run(supervised.data)
     accuracy = supervised.get_accuracy(results)
     print(accuracy)

### Results data was collected and compared using boxplots in Excel