In [13]:
import pandas as pd
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters
import matplotlib.pyplot as plt

In [14]:
# Load the crowd-sourcing export: a tab-separated file whose first row holds
# the column names.
with open("./exports/crowd_data.tsv") as tsv_handle:
    crowd_df = pd.read_csv(tsv_handle, delimiter="\t")

crowd_df.head()

Unnamed: 0,HITId,HITTypeId,Title,Reward,AssignmentId,WorkerId,AssignmentStatus,WorkTimeInSeconds,LifetimeApprovalRate,Input1ID,Input2ID,Input3ID,AnswerID,AnswerLabel,FixPosition,FixValue
0,1,7QT,Is this triple correct or incorrect?,$0.50,1,2133ICYWE97,Submitted,60,99%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
1,1,7QT,Is this triple correct or incorrect?,$0.50,2,2133U7HKDLO,Submitted,40,40%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,yes,yes
2,1,7QT,Is this triple correct or incorrect?,$0.50,3,928UJANWZ12,Submitted,50,98%,wd:Q11621,wdt:P2142,792910554,2.0,INCORRECT,,
3,1,7QT,Is this triple correct or incorrect?,$0.50,4,1726JMZQW,Submitted,80,70%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
4,1,7QT,Is this triple correct or incorrect?,$0.50,5,2134U7HKDMM,Submitted,2,70%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,


In [15]:
# One record per HIT: how many workers voted CORRECT / INCORRECT, plus every
# row in which both correction fields (FixPosition and FixValue) were filled in.
summary_data = [
    {
        "HITId": hit_id,
        "Votes_CORRECT": len(hit_rows[hit_rows["AnswerLabel"] == "CORRECT"]),
        "Votes_INCORRECT": len(hit_rows[hit_rows["AnswerLabel"] == "INCORRECT"]),
        "Corrections": hit_rows[["FixPosition", "FixValue"]].dropna().to_dict(orient="records"),
    }
    for hit_id, hit_rows in crowd_df.groupby("HITId")
]

summary_df = pd.DataFrame(summary_data)
summary_df


Unnamed: 0,HITId,Votes_CORRECT,Votes_INCORRECT,Corrections
0,1,4,1,"[{'FixPosition': 'yes', 'FixValue': 'yes'}]"
1,2,4,1,"[{'FixPosition': 'yes', 'FixValue': 'yes'}]"
2,3,3,2,"[{'FixPosition': 'Object', 'FixValue': '2014-0..."
3,4,2,3,"[{'FixPosition': 'yes', 'FixValue': 'yes'}]"
4,5,4,1,"[{'FixPosition': 'yes', 'FixValue': 'yes'}]"
...,...,...,...,...
56,57,3,2,"[{'FixPosition': 'Object', 'FixValue': 'Q18190..."
57,58,1,4,"[{'FixPosition': 'Object', 'FixValue': 'Q73608..."
58,59,3,2,"[{'FixPosition': 'Object', 'FixValue': 'Q74884..."
59,60,2,3,"[{'FixPosition': 'Object', 'FixValue': 'Q884'}..."


In [16]:
def aggregate_workers(data):
    """Summarise approval rate and work time per worker.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``WorkerId``, ``LifetimeApprovalRate``
        (values such as ``"99%"``; a trailing ``%`` is stripped before the
        conversion to float) and ``WorkTimeInSeconds``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``WorkerId`` with two-level columns: the mean of
        ``LifetimeApprovalRate`` and the mean, standard deviation and median
        of ``WorkTimeInSeconds``.

    Notes
    -----
    The input frame is not modified: the percentage parsing is applied to a
    copy, so the caller's ``LifetimeApprovalRate`` column keeps its original
    values and dtype.
    """
    approval_pct = data['LifetimeApprovalRate'].astype(str).str.rstrip('%').astype(float)

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    prepared = data.assign(LifetimeApprovalRate=approval_pct)

    worker_stats = prepared.groupby('WorkerId').agg({
        'LifetimeApprovalRate': ['mean'],
        'WorkTimeInSeconds': ['mean', 'std', 'median']
    })

    return worker_stats

# Per-worker approval-rate and work-time statistics, indexed by WorkerId.
worker_df = aggregate_workers(crowd_df)
worker_df

Unnamed: 0_level_0,LifetimeApprovalRate,WorkTimeInSeconds,WorkTimeInSeconds,WorkTimeInSeconds
Unnamed: 0_level_1,mean,mean,std,median
WorkerId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1726JMZQW,70.0,188.52381,86.826044,240.0
2133ICYWE97,99.0,108.571429,28.685487,120.0
2133U7HKDLO,40.0,41.0,3.03315,40.0
2134U7HKDMM,70.0,2.0,0.0,2.0
928UJANWZ12,98.0,146.0,91.913002,80.0
AALKMII97,98.0,240.0,0.0,240.0
AALKMII98,98.0,244.05,21.93885,240.0
GGUI83657S,85.0,120.0,0.0,120.0
HHCKW1111,80.0,201.7,15.894201,200.0
LPQMUDT6729,40.0,4.0,0.0,4.0


In [17]:
def _worker_record(worker_id, rows):
    """Build the summary record for all assignments submitted by one worker."""
    return {
        "WorkerId": worker_id,
        "Total_Questions_Answered": len(rows),
        "Total_Time_Spent": rows["WorkTimeInSeconds"].sum(),
        "Answers": rows[["HITId", "AnswerLabel", "WorkTimeInSeconds"]].to_dict(orient="records"),
    }


# One record per worker: number of answers, total seconds spent, and the
# individual (HIT, label, time) answers.
worker_data = [
    _worker_record(worker_id, rows)
    for worker_id, rows in crowd_df.groupby("WorkerId")
]

worker_df = pd.DataFrame(worker_data)
worker_df

Unnamed: 0,WorkerId,Total_Questions_Answered,Total_Time_Spent,Answers
0,1726JMZQW,21,3959,"[{'HITId': 1, 'AnswerLabel': 'CORRECT', 'WorkT..."
1,2133ICYWE97,21,2280,"[{'HITId': 1, 'AnswerLabel': 'CORRECT', 'WorkT..."
2,2133U7HKDLO,21,861,"[{'HITId': 1, 'AnswerLabel': 'CORRECT', 'WorkT..."
3,2134U7HKDMM,21,42,"[{'HITId': 1, 'AnswerLabel': 'CORRECT', 'WorkT..."
4,928UJANWZ12,21,3066,"[{'HITId': 1, 'AnswerLabel': 'INCORRECT', 'Wor..."
5,AALKMII97,20,4800,"[{'HITId': 42, 'AnswerLabel': 'INCORRECT', 'Wo..."
6,AALKMII98,20,4881,"[{'HITId': 22, 'AnswerLabel': 'INCORRECT', 'Wo..."
7,GGUI83657S,40,4800,"[{'HITId': 22, 'AnswerLabel': 'INCORRECT', 'Wo..."
8,HHCKW1111,40,8068,"[{'HITId': 22, 'AnswerLabel': 'CORRECT', 'Work..."
9,LPQMUDT6729,20,80,"[{'HITId': 22, 'AnswerLabel': 'INCORRECT', 'Wo..."


In [18]:
# Find (WorkerId, HITId) pairs that occur more than once, i.e. a worker who
# submitted several answers for the same HIT. keep=False marks every
# occurrence of a repeated pair, not just the later ones.
is_repeated = crowd_df.duplicated(subset=["WorkerId", "HITId"], keep=False)
duplicate_answers = crowd_df[is_repeated]
duplicate_summary = (
    duplicate_answers
    .groupby(["WorkerId", "HITId"])
    .size()
    .reset_index(name="Duplicate_Count")
)

if not duplicate_summary.empty:
    print("Workers who answered the same question more than once:")
    # List the offending pairs; without this the header is printed with
    # nothing underneath it.
    print(duplicate_summary.to_string(index=False))
else:
    print("No workers answered the same question more than once.")


No workers answered the same question more than once.


In [19]:

def _unique_fixes(hit_rows, position):
    """Return the distinct FixValue entries given for `position`, in first-seen order."""
    fixes = hit_rows.loc[hit_rows["FixPosition"] == position, "FixValue"].dropna()
    # drop_duplicates keeps row order; list(set(...)) would reorder string
    # values from one kernel session to the next (string hashing is randomised).
    return fixes.drop_duplicates().tolist()


# Per-HIT vote tallies together with the distinct corrections proposed for each
# triple position. HITs without any Subject/Object/Predicate correction are
# left out of the summary.
summary_data = []

for hit_id, group in crowd_df.groupby("HITId"):
    total_correct = group[group["AnswerLabel"] == "CORRECT"].shape[0]
    total_incorrect = group[group["AnswerLabel"] == "INCORRECT"].shape[0]

    corrections_subject = _unique_fixes(group, "Subject")
    corrections_object = _unique_fixes(group, "Object")
    corrections_predicate = _unique_fixes(group, "Predicate")

    if corrections_predicate or corrections_object or corrections_subject:
        summary_data.append({
            "HITId": hit_id,
            "Votes_CORRECT": total_correct,
            "Votes_INCORRECT": total_incorrect,
            "Corrections_Subject": corrections_subject,
            "Corrections_Object": corrections_object,
            "Corrections_Predicate": corrections_predicate
        })

summary_df = pd.DataFrame(summary_data)
summary_df


Unnamed: 0,HITId,Votes_CORRECT,Votes_INCORRECT,Corrections_Subject,Corrections_Object,Corrections_Predicate
0,3,3,2,[],[2014-02-18],[]
1,6,3,2,[],[2019-02-24],[]
2,8,2,3,[],[698491348],[]
3,11,3,2,[],[wd:Q72],[]
4,13,4,1,[],[wd:Q94074],[]
5,17,2,3,[],[wd:Q95073],[]
6,22,2,3,[],[2010-01-01],[]
7,23,0,5,[],[2011-01-01],[]
8,24,1,4,[],[176997168],[]
9,25,4,1,[],[1992-01-01],[]


In [20]:

def validate_corrections(corrections_, position, hit_id=None):
    """Return the single agreed-upon correction, or None when there is none.

    Parameters
    ----------
    corrections_ : list
        Correction values proposed for one position of one HIT.
    position : str
        Position name, used only in the error message.
    hit_id : optional
        HIT identifier, used only in the error message.

    Raises
    ------
    ValueError
        If the proposed corrections contain more than one distinct value.
    """
    if len(set(corrections_)) > 1:
        raise ValueError(f"Inconsistent corrections for {position} in HIT {hit_id}: {corrections_}")
    return corrections_[0] if corrections_ else None


# Maps each original (subject, predicate, object) triple to its corrected
# version. A position without any proposed correction keeps its original value,
# so a HIT with no corrections maps to itself.
correction_dict = {}

# Triple position name -> column holding the original value for that position.
_POSITION_COLUMNS = (
    ("Subject", "Input1ID"),
    ("Predicate", "Input2ID"),
    ("Object", "Input3ID"),
)

for hit_id, group in crowd_df.groupby("HITId"):
    original_triple = []
    corrected_triple = []

    for position, column in _POSITION_COLUMNS:
        # The original value is read from the first row of the HIT's group.
        original_value = group[column].iloc[0]
        proposed = group.loc[group["FixPosition"] == position, "FixValue"].dropna().tolist()

        # Raises ValueError as soon as workers disagree on a correction.
        agreed = validate_corrections(proposed, position, hit_id)

        original_triple.append(original_value)
        corrected_triple.append(original_value if agreed is None else agreed)

    correction_dict[tuple(original_triple)] = tuple(corrected_triple)

def search_triple(subject, predicate, object_):
    """Look up the corrected triple stored for an original triple.

    Returns the value held in ``correction_dict`` under the key
    ``(subject, predicate, object_)``, or the string
    ``"No corrections found"`` when that key is absent.
    """
    lookup_key = (subject, predicate, object_)
    if lookup_key in correction_dict:
        return correction_dict[lookup_key]
    return "No corrections found"

# Example where there exists a correction
example_subject = "wd:Q223596"
example_predicate = "wdt:P1431"
example_object = "wd:Q457180"

# search_triple is a plain dictionary lookup with a fallback string, so it
# cannot raise ValueError; no exception handler is needed around the call.
result = search_triple(example_subject, example_predicate, example_object)
print(f"Corrected triple for ({example_subject}, {example_predicate}, {example_object}):")
print(result)


Corrected triple for (wd:Q223596, wdt:P1431, wd:Q457180):
('wd:Q223596', 'wdt:P1431', 'Q181900')
