In [207]:
import numpy as np
import pandas as pd
import json

from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters
import matplotlib.pyplot as plt

In [208]:
# read tsv with headers
with open("./exports/crowd_data.tsv", "r") as f:
    crowd_df = pd.read_csv(f, sep="\t", index_col="AssignmentId")
    crowd_df = crowd_df.sort_values(by=["HITId", "WorkerId"])

crowd_df

Unnamed: 0_level_0,HITId,HITTypeId,Title,Reward,WorkerId,AssignmentStatus,WorkTimeInSeconds,LifetimeApprovalRate,Input1ID,Input2ID,Input3ID,AnswerID,AnswerLabel,FixPosition,FixValue
AssignmentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
4,1,7QT,Is this triple correct or incorrect?,$0.50,1726JMZQW,Submitted,80,70%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
1,1,7QT,Is this triple correct or incorrect?,$0.50,2133ICYWE97,Submitted,60,99%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
2,1,7QT,Is this triple correct or incorrect?,$0.50,2133U7HKDLO,Submitted,40,40%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,yes,yes
5,1,7QT,Is this triple correct or incorrect?,$0.50,2134U7HKDMM,Submitted,2,70%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
3,1,7QT,Is this triple correct or incorrect?,$0.50,928UJANWZ12,Submitted,50,98%,wd:Q11621,wdt:P2142,792910554,2.0,INCORRECT,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,61,9QT,Is this triple correct or incorrect?,$0.50,AALKMII97,Submitted,240,98%,wd:Q1288004,wdt:P1412,wd:Q13330,2.0,INCORRECT,Object,Q1860
303,61,9QT,Is this triple correct or incorrect?,$0.50,GGUI83657S,Submitted,120,85%,wd:Q1288004,wdt:P1412,wd:Q13330,1.0,CORRECT,,
302,61,9QT,Is this triple correct or incorrect?,$0.50,HHCKW1111,Submitted,200,80%,wd:Q1288004,wdt:P1412,wd:Q13330,1.0,CORRECT,,
305,61,9QT,Is this triple correct or incorrect?,$0.50,QZAHIFT8263,Submitted,2,40%,wd:Q1288004,wdt:P1412,wd:Q13330,2.0,INCORRECT,100,100


In [209]:
crowd_df = crowd_df.drop(columns=["Title", "Reward", "AssignmentStatus"])
crowd_df

Unnamed: 0_level_0,HITId,HITTypeId,WorkerId,WorkTimeInSeconds,LifetimeApprovalRate,Input1ID,Input2ID,Input3ID,AnswerID,AnswerLabel,FixPosition,FixValue
AssignmentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4,1,7QT,1726JMZQW,80,70%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
1,1,7QT,2133ICYWE97,60,99%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
2,1,7QT,2133U7HKDLO,40,40%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,yes,yes
5,1,7QT,2134U7HKDMM,2,70%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
3,1,7QT,928UJANWZ12,50,98%,wd:Q11621,wdt:P2142,792910554,2.0,INCORRECT,,
...,...,...,...,...,...,...,...,...,...,...,...,...
301,61,9QT,AALKMII97,240,98%,wd:Q1288004,wdt:P1412,wd:Q13330,2.0,INCORRECT,Object,Q1860
303,61,9QT,GGUI83657S,120,85%,wd:Q1288004,wdt:P1412,wd:Q13330,1.0,CORRECT,,
302,61,9QT,HHCKW1111,200,80%,wd:Q1288004,wdt:P1412,wd:Q13330,1.0,CORRECT,,
305,61,9QT,QZAHIFT8263,2,40%,wd:Q1288004,wdt:P1412,wd:Q13330,2.0,INCORRECT,100,100


In [210]:
crowd_df['LifetimeApprovalRate'] = crowd_df['LifetimeApprovalRate'].astype(str).str.rstrip('%').astype(float)

median_rate = crowd_df['LifetimeApprovalRate'].median()

correction = 0
crowd_df = crowd_df[(crowd_df['LifetimeApprovalRate'] >= median_rate - correction)]

print(f"Median lifetime approval rate: {int(median_rate)}%")

crowd_df

Median lifetime approval rate: 80%


Unnamed: 0_level_0,HITId,HITTypeId,WorkerId,WorkTimeInSeconds,LifetimeApprovalRate,Input1ID,Input2ID,Input3ID,AnswerID,AnswerLabel,FixPosition,FixValue
AssignmentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,7QT,2133ICYWE97,60,99.0,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
3,1,7QT,928UJANWZ12,50,98.0,wd:Q11621,wdt:P2142,792910554,2.0,INCORRECT,,
7,2,7QT,2133ICYWE97,120,99.0,wd:Q603545,wdt:P2142,4300000,1.0,CORRECT,,
8,2,7QT,928UJANWZ12,60,98.0,wd:Q603545,wdt:P2142,4300000,1.0,CORRECT,,
11,3,7QT,2133ICYWE97,140,99.0,wd:Q16911843,wdt:P577,2014-01-18,2.0,INCORRECT,Object,2014-02-18
...,...,...,...,...,...,...,...,...,...,...,...,...
298,60,9QT,GGUI83657S,120,85.0,wd:Q21060270,wdt:P27,wd:Q916,1.0,CORRECT,,
297,60,9QT,HHCKW1111,200,80.0,wd:Q21060270,wdt:P27,wd:Q916,1.0,CORRECT,,
301,61,9QT,AALKMII97,240,98.0,wd:Q1288004,wdt:P1412,wd:Q13330,2.0,INCORRECT,Object,Q1860
303,61,9QT,GGUI83657S,120,85.0,wd:Q1288004,wdt:P1412,wd:Q13330,1.0,CORRECT,,


In [211]:
median_rate = crowd_df['WorkTimeInSeconds'].median()

correction = 0
crowd_df = crowd_df[(crowd_df['WorkTimeInSeconds'] >= median_rate - correction)]

print(f"Median work time in seconds: {int(median_rate)}s")
crowd_df

Median work time in seconds: 200s


Unnamed: 0_level_0,HITId,HITTypeId,WorkerId,WorkTimeInSeconds,LifetimeApprovalRate,Input1ID,Input2ID,Input3ID,AnswerID,AnswerLabel,FixPosition,FixValue
AssignmentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
17,4,7QT,928UJANWZ12,240,98.0,wd:Q132863,wdt:P2142,969023261,2.0,INCORRECT,,
23,5,7QT,928UJANWZ12,240,98.0,wd:Q1628022,wdt:P577,1951-01-01,1.0,CORRECT,,
37,8,7QT,928UJANWZ12,240,98.0,wd:Q217010,wdt:P2142,698491347,2.0,INCORRECT,Object,698491348
52,11,7QT,928UJANWZ12,240,98.0,wd:Q1339195,ddis:indirectSubclassOf,wd:Q27096213,2.0,INCORRECT,Object,wd:Q72
57,12,7QT,928UJANWZ12,240,98.0,wd:Q104649845,ddis:indirectSubclassOf,wd:Q43229,1.0,CORRECT,,
...,...,...,...,...,...,...,...,...,...,...,...,...
292,59,9QT,HHCKW1111,200,80.0,wd:Q1893555,wdt:P272,wd:Q48784114,2.0,INCORRECT,Object,Q7488442
296,60,9QT,AALKMII97,240,98.0,wd:Q21060270,wdt:P27,wd:Q916,2.0,INCORRECT,Object,Q884
297,60,9QT,HHCKW1111,200,80.0,wd:Q21060270,wdt:P27,wd:Q916,1.0,CORRECT,,
301,61,9QT,AALKMII97,240,98.0,wd:Q1288004,wdt:P1412,wd:Q13330,2.0,INCORRECT,Object,Q1860


In [212]:
crowd_df = crowd_df.drop(columns=["WorkTimeInSeconds", "LifetimeApprovalRate"])

In [213]:
filtered_df = crowd_df.groupby("HITId").filter(
    lambda group: group[group["AnswerLabel"] == "INCORRECT"].shape[0] >
                  group[group["AnswerLabel"] == "CORRECT"].shape[0]
)
filtered_df

Unnamed: 0_level_0,HITId,HITTypeId,WorkerId,Input1ID,Input2ID,Input3ID,AnswerID,AnswerLabel,FixPosition,FixValue
AssignmentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
17,4,7QT,928UJANWZ12,wd:Q132863,wdt:P2142,969023261,2.0,INCORRECT,,
37,8,7QT,928UJANWZ12,wd:Q217010,wdt:P2142,698491347,2.0,INCORRECT,Object,698491348
52,11,7QT,928UJANWZ12,wd:Q1339195,ddis:indirectSubclassOf,wd:Q27096213,2.0,INCORRECT,Object,wd:Q72
67,14,7QT,928UJANWZ12,wd:Q1004,ddis:indirectSubclassOf,wd:Q47461344,2.0,INCORRECT,,
97,20,7QT,928UJANWZ12,wd:Q15239622,ddis:indirectSubclassOf,wd:Q27096213,2.0,INCORRECT,,
106,22,8QT,AALKMII98,wd:Q1720855,wdt:P577,2010-10-01,2.0,INCORRECT,Object,2010-01-01
111,23,8QT,AALKMII98,wd:Q598752,wdt:P577,2001-01-01,2.0,INCORRECT,Object,2011-01-01
112,23,8QT,HHCKW1111,wd:Q598752,wdt:P577,2001-01-01,2.0,INCORRECT,Object,2011-01-01
127,26,8QT,AALKMII98,wd:Q1410031,wdt:P577,2010-10-01,2.0,INCORRECT,Object,2011-01-01
126,26,8QT,HHCKW1111,wd:Q1410031,wdt:P577,2010-10-01,2.0,INCORRECT,Object,2011-01-01


In [214]:
json_files = ["./exports/people_db.json", "./exports/movie_db.json"]
merged_data = {
    k.split("/")[-1]: v
    for file_path in json_files
    for k, v in json.load(open(file_path, 'r')).items()
}

with open("./exports/predicate_db.json", 'r') as f:
    # Reversed index
    merged_data.update({v.split("/")[-1]: k for k, v in json.load(f).items()})

def set_label(value, required=True):
    value = str(value)
    if "/" in value:
        value = value.split("/")[-1]
    elif ":" in value:
        value = value.split(":")[-1]
    return merged_data.get(value, np.nan if required else value)

for column in ["Input1ID", "Input2ID"]:
    filtered_df[column] = filtered_df[column].apply(set_label, required=True)

for column in ["Input3ID", "FixValue"]:
    filtered_df[column] = filtered_df[column].apply(set_label, required=False)


filtered_df = filtered_df.dropna(subset=["Input1ID", "Input2ID", "Input3ID"])

filtered_df

Unnamed: 0_level_0,HITId,HITTypeId,WorkerId,Input1ID,Input2ID,Input3ID,AnswerID,AnswerLabel,FixPosition,FixValue
AssignmentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
17,4,7QT,928UJANWZ12,finding nemo,box office,969023261,2.0,INCORRECT,,
37,8,7QT,928UJANWZ12,the twilight saga eclipse,box office,698491347,2.0,INCORRECT,Object,698491348
106,22,8QT,AALKMII98,kaboom,publication date,2010-10-01,2.0,INCORRECT,Object,2010-01-01
111,23,8QT,AALKMII98,like crazy,publication date,2001-01-01,2.0,INCORRECT,Object,2011-01-01
112,23,8QT,HHCKW1111,like crazy,publication date,2001-01-01,2.0,INCORRECT,Object,2011-01-01
127,26,8QT,AALKMII98,tom meets zizou,publication date,2010-10-01,2.0,INCORRECT,Object,2011-01-01
126,26,8QT,HHCKW1111,tom meets zizou,publication date,2010-10-01,2.0,INCORRECT,Object,2011-01-01
141,29,8QT,AALKMII98,magic carpet ride,cast member,arthur dupont,2.0,INCORRECT,Object,ezgi mola
142,29,8QT,HHCKW1111,magic carpet ride,cast member,arthur dupont,2.0,INCORRECT,Object,ezgi mola
166,34,8QT,AALKMII98,martial arts of shaolin,cast member,peter breitmayer,2.0,INCORRECT,Subject,horrible bosses


In [215]:
duplicate_answers = crowd_df[crowd_df.duplicated(subset=["WorkerId", "HITId"], keep=False)]
duplicate_summary = duplicate_answers.groupby(["WorkerId", "HITId"]).size().reset_index(name="Duplicate_Count")

if not duplicate_summary.empty:
    print("Workers who answered the same question more than once:")
else:
    print("No workers answered the same question more than once.")


No workers answered the same question more than once.


In [221]:
result = (
    filtered_df.groupby(["Input1ID", "Input2ID", "Input3ID"])
    .agg(
        Votes_CORRECT=("AnswerLabel", lambda x: (x == "CORRECT").sum()),
        Votes_INCORRECT=("AnswerLabel", lambda x: (x == "INCORRECT").sum()),
        Corrections_Subject=("FixValue", lambda x: x[filtered_df["FixPosition"] == "Subject"].tolist()),
        Corrections_Predicate=("FixValue", lambda x: x[filtered_df["FixPosition"] == "Predicate"].tolist()),
        Corrections_Object=("FixValue", lambda x: x[filtered_df["FixPosition"] == "Object"].tolist()),
    )
    .reset_index()
)

def ensure_single_value(corrections):
    unique_values = list(set([v for v in corrections if pd.notna(v) and v != "nan"]))
    if len(unique_values) > 1:
        raise ValueError(f"Multiple values found where a single value is expected: {unique_values}")
    return unique_values[0] if unique_values else np.nan  # Return the single value or None if the list is empty

# Apply the function to ensure a single value per correction
result["Corrections_Subject"] = result["Corrections_Subject"].apply(ensure_single_value)
result["Corrections_Predicate"] = result["Corrections_Predicate"].apply(ensure_single_value)
result["Corrections_Object"] = result["Corrections_Object"].apply(ensure_single_value)

result

Unnamed: 0,Input1ID,Input2ID,Input3ID,Votes_CORRECT,Votes_INCORRECT,Corrections_Subject,Corrections_Predicate,Corrections_Object
0,a happy event,cast member,theis schmidt,0,2,,,erika sainte
1,a night at the opera,cast member,jude law,0,2,,,rolfe sedan
2,conan the barbarian,voice actor,rose mcgowan,0,2,,cast member,
3,eskimo,production designer,john lee mahin,0,2,,screenwriter,
4,finding nemo,box office,969023261,0,1,,,
5,iron man 3,jmk film rating,Q28732983,0,2,,,Q28732985
6,kaboom,publication date,2010-10-01,0,1,,,2010-01-01
7,like crazy,publication date,2001-01-01,0,2,,,2011-01-01
8,magic carpet ride,cast member,arthur dupont,0,2,,,ezgi mola
9,martial arts of shaolin,cast member,peter breitmayer,0,2,horrible bosses,,


In [222]:
result.dropna(subset=["Corrections_Subject", "Corrections_Predicate", "Corrections_Object"])
result

Unnamed: 0,Input1ID,Input2ID,Input3ID,Votes_CORRECT,Votes_INCORRECT,Corrections_Subject,Corrections_Predicate,Corrections_Object
0,a happy event,cast member,theis schmidt,0,2,,,erika sainte
1,a night at the opera,cast member,jude law,0,2,,,rolfe sedan
2,conan the barbarian,voice actor,rose mcgowan,0,2,,cast member,
3,eskimo,production designer,john lee mahin,0,2,,screenwriter,
4,finding nemo,box office,969023261,0,1,,,
5,iron man 3,jmk film rating,Q28732983,0,2,,,Q28732985
6,kaboom,publication date,2010-10-01,0,1,,,2010-01-01
7,like crazy,publication date,2001-01-01,0,2,,,2011-01-01
8,magic carpet ride,cast member,arthur dupont,0,2,,,ezgi mola
9,martial arts of shaolin,cast member,peter breitmayer,0,2,horrible bosses,,
