In [2]:
import numpy as np
from truthdiscovery import (
    AverageLog,
    ConvergenceIterator,
    FixedIterator,
    Investment,
    MajorityVoting,
    PooledInvestment,
    Sums,
    TruthFinder,
    SyntheticData,
    Dataset,
    FileDataset,
    SupervisedData
)



### Synthetic data format:

In [8]:
# Generate one synthetic dataset and show its raw CSV form, one list entry
# per CSV line. `trust` is drawn as four uniform random numbers
# (presumably one trust value per source -- confirm against the
# truthdiscovery documentation).
synth = SyntheticData(
    trust=np.random.uniform(size=(4,)),
    num_variables=100,
    claim_probability=0.75,
    domain_size=20,
).to_csv()

# Split on newlines so every CSV row becomes its own string.
data = synth.split('\n')
print(data)

['16.0,18.0,18.0,6.0,12.0,0.0,6.0,3.0,2.0,14.0,17.0,0.0,15.0,19.0,3.0,3.0,0.0,4.0,3.0,0.0,16.0,4.0,9.0,13.0,5.0,17.0,9.0,18.0,18.0,10.0,0.0,17.0,19.0,9.0,13.0,2.0,8.0,12.0,9.0,4.0,14.0,8.0,7.0,17.0,2.0,14.0,5.0,4.0,13.0,2.0,19.0,8.0,13.0,19.0,6.0,11.0,9.0,8.0,0.0,0.0,10.0,5.0,13.0,7.0,15.0,5.0,9.0,12.0,11.0,3.0,18.0,9.0,5.0,4.0,15.0,0.0,13.0,18.0,7.0,14.0,17.0,1.0,17.0,6.0,18.0,8.0,7.0,7.0,5.0,2.0,3.0,14.0,6.0,2.0,5.0,0.0,9.0,8.0,9.0,15.0', '16.0,12.0,16.0,0.0,16.0,12.0,10.0,,2.0,19.0,16.0,11.0,0.0,,4.0,3.0,16.0,10.0,0.0,0.0,10.0,4.0,13.0,17.0,5.0,,12.0,,18.0,10.0,19.0,17.0,18.0,9.0,7.0,11.0,2.0,18.0,9.0,7.0,2.0,15.0,,4.0,15.0,8.0,8.0,3.0,,2.0,1.0,8.0,2.0,18.0,13.0,3.0,9.0,2.0,3.0,,,14.0,0.0,7.0,18.0,14.0,,12.0,2.0,,,8.0,,0.0,12.0,3.0,,5.0,7.0,3.0,5.0,1.0,17.0,7.0,,8.0,,8.0,5.0,12.0,19.0,,,9.0,17.0,,15.0,8.0,14.0,12.0', ',,9.0,14.0,5.0,2.0,9.0,12.0,2.0,14.0,,0.0,15.0,11.0,3.0,3.0,8.0,19.0,10.0,,16.0,,19.0,,19.0,19.0,9.0,,,,12.0,19.0,17.0,4.0,13.0,16.0,17.0,2.0,16.0,4.0,,6.0,12.0,,2.0,1

### Processed synthetic data format:

In [9]:
# Generate ONE synthetic dataset and convert it into the two structures used
# by the accuracy experiments:
#   ground_truths -- dict mapping 'cell <k>' to the value in the first CSV row
#   claims        -- list of ('source <s>', 'cell <k>', value) tuples
# This cell only displays the processed format, so a single dataset is
# enough; regenerating it repeatedly would discard all but the last one.

# random cell type data synthesis
synth = SyntheticData(
    trust=np.random.uniform(size=(4,)),
    num_variables=100,
    claim_probability=0.25,
    domain_size=20
)
synth = synth.to_csv()
data = synth.split('\n')

# data processing
# The first CSV row is stored as the ground truth; every following row is
# treated as the claims of one source, numbered from 1.
truth_row, *source_rows = data

ground_truths = {
    f'cell {i+1}': cell_type_t
    for i, cell_type_t in enumerate(truth_row.split(','))
}

claims = [
    (f'source {index}', f'cell {z+1}', cell_type)
    for index, row in enumerate(source_rows, start=1)
    for z, cell_type in enumerate(row.split(','))
    # An empty CSV field presumably means the source made no claim for that
    # cell (expected when claim_probability < 1); recording it would create
    # a bogus claim with the value ''.
    if cell_type
]

print('Ground truths:')
print(ground_truths)
print()
print("Claims:")
print(claims)

Ground truths:
{'cell 1': '9.0', 'cell 2': '15.0', 'cell 3': '9.0', 'cell 4': '17.0', 'cell 5': '18.0', 'cell 6': '16.0', 'cell 7': '8.0', 'cell 8': '14.0', 'cell 9': '11.0', 'cell 10': '14.0', 'cell 11': '10.0', 'cell 12': '10.0', 'cell 13': '7.0', 'cell 14': '19.0', 'cell 15': '11.0', 'cell 16': '18.0', 'cell 17': '11.0', 'cell 18': '6.0', 'cell 19': '18.0', 'cell 20': '17.0', 'cell 21': '16.0', 'cell 22': '14.0', 'cell 23': '10.0', 'cell 24': '7.0', 'cell 25': '8.0', 'cell 26': '6.0', 'cell 27': '3.0', 'cell 28': '13.0', 'cell 29': '3.0', 'cell 30': '8.0', 'cell 31': '19.0', 'cell 32': '7.0', 'cell 33': '1.0', 'cell 34': '19.0', 'cell 35': '17.0', 'cell 36': '0.0', 'cell 37': '4.0', 'cell 38': '3.0', 'cell 39': '4.0', 'cell 40': '18.0', 'cell 41': '15.0', 'cell 42': '11.0', 'cell 43': '12.0', 'cell 44': '6.0', 'cell 45': '9.0', 'cell 46': '3.0', 'cell 47': '10.0', 'cell 48': '8.0', 'cell 49': '19.0', 'cell 50': '6.0', 'cell 51': '5.0', 'cell 52': '11.0', 'cell 53': '0.0', 'cell 54':

### Average algorithm accuracy testing (each printed result is the average of 100 runs)
(Assign any of the imported algorithms to the `algorithm` variable.)

In [3]:
# Average-accuracy experiment: print NUM_AVERAGES results, each being the
# mean accuracy of the chosen algorithm over RUNS_PER_AVERAGE freshly
# generated synthetic datasets.
NUM_AVERAGES = 100
RUNS_PER_AVERAGE = 100  # exactly 100 runs per printed average

for _batch in range(NUM_AVERAGES):
    accuracy_score = []
    for _run in range(RUNS_PER_AVERAGE):
        # random cell type data synthesis
        synth = SyntheticData(
            trust=np.random.uniform(size=(4,)),
            num_variables=100,
            claim_probability=1.0,
            domain_size=20
        )
        synth = synth.to_csv()
        data = synth.split('\n')

        # data processing
        # The first CSV row is stored as the ground truth; every following
        # row is treated as the claims of one source, numbered from 1.
        truth_row, *source_rows = data
        ground_truths = {
            f'cell {i+1}': cell_type_t
            for i, cell_type_t in enumerate(truth_row.split(','))
        }
        claims = [
            (f'source {index}', f'cell {z+1}', cell_type)
            for index, row in enumerate(source_rows, start=1)
            for z, cell_type in enumerate(row.split(','))
        ]

        # Swap in any of the imported algorithms here.
        algorithm = TruthFinder(initial_trust=0.1, influence_param = 0.0)
        mydata = Dataset(claims)
        supervised = SupervisedData(mydata, ground_truths)
        results = algorithm.run(supervised.data)
        accuracy = supervised.get_accuracy(results)
        accuracy_score.append(accuracy)

    # Mean accuracy over this batch of runs.
    average_accuracy = sum(accuracy_score)/len(accuracy_score)
    print(average_accuracy)

0.741134760211206
0.7325033930868163
0.7301516365087457
0.6972360301638355
0.7304126993266571
0.7138451947746672
0.7040208684139683
0.7339909463063699
0.7287621225746158
0.7187450976908492
0.70272174287803


KeyboardInterrupt: 

### Per-run (non-averaged) accuracy testing: accuracy of each of 1000 individual runs
(Assign any of the imported algorithms to the `algorithm` variable.)

In [4]:
 # Per-run accuracy experiment: generate 1000 independent synthetic datasets,
 # run the chosen algorithm on each one and print every accuracy on its own line.
 for _ in range(1000):
     # random cell type data synthesis
     synth = SyntheticData(
         trust=np.random.uniform(size=(4,)),
         num_variables=100,
         claim_probability=1.0,
         domain_size=20,
     ).to_csv()
     data = synth.split('\n')

     # data processing: the first CSV row is stored as the ground truth and
     # each later row is recorded as the claims of one source (numbered from 1)
     truth_row, *source_rows = data

     ground_truths = {}
     for i, cell_type_t in enumerate(truth_row.split(',')):
         ground_truths[f'cell {i+1}'] = cell_type_t

     claims = []
     for index, row in enumerate(source_rows, start=1):
         for z, cell_type in enumerate(row.split(',')):
             claims.append((f'source {index}', f'cell {z+1}', cell_type))

     # Swap in any of the imported algorithms here.
     algorithm = Investment()
     mydata = Dataset(claims)
     supervised = SupervisedData(mydata, ground_truths)
     results = algorithm.run(supervised.data)
     accuracy = supervised.get_accuracy(results)
     print(accuracy)

0.84375
0.68
0.6129032258064516
0.7551020408163265
0.788235294117647
0.9895833333333334
0.9361702127659575
0.83
0.9595959595959596
0.9393939393939394
0.95
0.9886363636363636
0.73
0.21
0.95
0.87
0.9896907216494846
0.8172043010752689
0.29
0.9175257731958762
0.75
0.97
0.8
0.77
0.9375
0.8658536585365854
0.41836734693877553
0.8
0.865979381443299
0.8181818181818182
0.9310344827586207
0.89
0.8673469387755102
0.8673469387755102
0.9479166666666666
1.0
0.23
0.79
0.76
0.8297872340425532
0.1
0.8170731707317073
0.62
0.6666666666666666
0.99
0.8247422680412371
0.7171717171717171
0.87
0.9666666666666667
1.0
0.7959183673469388
0.9361702127659575
0.7604166666666666
0.9484536082474226
0.8777777777777778
1.0
0.9292929292929293
0.8125
0.9292929292929293
0.8131868131868132
1.0
0.8021978021978022
0.8958333333333334
0.82
0.9090909090909091
0.9795918367346939
0.8484848484848485
0.6767676767676768
0.9555555555555556
0.78
0.49
0.9897959183673469
0.97
1.0
0.6161616161616161
0.98989898989899
0.5670103092783505
0.6

0.39
0.53
0.5959595959595959
0.9032258064516129
0.7727272727272727
0.9883720930232558
0.51
0.91
0.953125
0.7551020408163265
0.5858585858585859
0.8645833333333334
0.98
1.0
0.38
0.7575757575757576
0.95
1.0
0.6777777777777778
0.48484848484848486
0.94
0.72
0.62
0.93
0.8080808080808081
0.8526315789473684
0.62
0.8
0.96
0.9130434782608695
1.0
0.40816326530612246
0.979381443298969
0.7272727272727273
0.52
0.9591836734693877
0.9753086419753086
0.9148936170212766
0.91
0.89
0.9176470588235294
0.7525773195876289
0.9487179487179487
0.8735632183908046
0.9240506329113924
0.6701030927835051
0.8571428571428571
0.8666666666666667
0.9896907216494846
0.9342105263157895
1.0
0.8041237113402062
0.7
0.6210526315789474
0.7738095238095238
0.9381443298969072
0.797979797979798
0.7628865979381443
0.8229166666666666
0.33
0.93
0.9154929577464789
0.9183673469387755
0.8369565217391305
0.51
0.9852941176470589
0.8282828282828283
0.8333333333333334
0.9797979797979798
0.8351648351648352
0.7634408602150538
0.72
0.8764044943

### The results were collected and compared using box plots in Excel