 # Progetto Social Computing no.2

In [1]:
from pathlib import Path
import os
import csv
import json
import random
import itertools
import math
from collections import defaultdict
from functools import reduce
from copy import deepcopy
from IPython.display import HTML, display
from typing import Any


## Init

In [2]:
# File names of the artifacts written by the generation cells
hits_filename = "hits.json"
mturk_vars_filename = "mturk_variables.csv"

# Variables mirroring the ones in the .env file
task_name = "SC_tasktest3"
batch_name = "SC_batchtest3"
aws_region = "us-east-1"
aws_deploy_bucket = "my-sc-project-bucket"

cwd = Path.cwd()
task_folder = cwd / "task"
# NOTE(review): results are read from "SC_task1" while task_name above is
# "SC_tasktest3" — confirm this is the intended result set.
data_dir = cwd / "result" / "SC_task1" / "Data"  # One JSON file per worker
dataframe_dir = cwd / "result" / "SC_task1" / "Dataframe"  # Answers

hits_file = task_folder / hits_filename
mturk_vars_file = cwd / mturk_vars_filename
answers_file = dataframe_dir / "workers_answers.csv"

# The cells guarded by `skip_mturk` (hits.json and MTurk variables generation)
# are skipped when hits.json is already on disk.
skip_mturk = hits_file.exists()

if skip_mturk:
    # The flag skips both generated files, so the message names the second
    # one (the MTurk variables file) instead of repeating hits.json.
    print(
        f"File {hits_filename} already exists. Skipping HITs generation and {mturk_vars_filename} generation"
    )


File hits.json already exists. Skipping HITs generation and hits.json generation


## Helpers

In [3]:
def float_atoi(x: str):
    """Parse a numeric string and drop its fractional part.

    The string is read as a float and then truncated toward zero, so
    "3.0" and "3.9" both give 3 and "-2.9" gives -2.
    """
    parsed = float(x)
    return math.trunc(parsed)


def create_task_matrix(answers: dict[str, dict[int, dict[str, int]]], scale: str):
    """Pivot per-worker answers into a per-document matrix for one scale.

    Args:
        answers: mapping worker id -> document id -> scale name -> value.
        scale: name of the scale to extract from every answer.

    Returns:
        A ``defaultdict`` mapping document id -> worker id -> value of
        ``scale`` given by that worker for that document.

    Raises:
        KeyError: if an answer has no entry for ``scale``.
    """
    task_matrix: dict[int, dict[str, int]] = defaultdict(dict)
    for worker_id in answers:
        for doc_id, scales in answers[worker_id].items():
            task_matrix[doc_id][worker_id] = scales[scale]
    return task_matrix


def calc_pairwise_agreement(
    pairs: list[tuple[str, str]], task_matrix: dict[int, dict[str, int]]
):
    """Compute, for every task, the share of worker pairs that agree.

    Args:
        pairs: worker-id pairs to compare.
        task_matrix: mapping task id -> worker id -> answer.

    Returns:
        A ``defaultdict`` mapping task id -> ``{"pairwise agreement": value}``
        where ``value`` is the fraction of comparable pairs that gave the
        same answer, rounded to two decimals.

    Only pairs in which both workers answered the task are compared, so a
    worker missing from a task no longer raises ``KeyError``. A task with no
    comparable pair (including an empty ``pairs`` list) gets 0.0 instead of
    raising ``ZeroDivisionError``. When every worker answered every task the
    result is the same as dividing by ``len(pairs)``.
    """
    pairwise_agreement: dict[int, dict[str, float]] = defaultdict(dict)
    for task_id, answers in task_matrix.items():
        # Keep only the pairs whose two workers both rated this task
        comparable = [
            (w1, w2) for (w1, w2) in pairs if w1 in answers and w2 in answers
        ]
        agreement_count = sum(
            1 for (w1, w2) in comparable if answers[w1] == answers[w2]
        )
        pairwise_agreement[task_id]["pairwise agreement"] = (
            round(agreement_count / len(comparable), 2) if comparable else 0.0
        )
    return pairwise_agreement


def calc_average_agreement(pairwise_agreement: dict[int, dict[str, float]]):
    """Average the per-task "pairwise agreement" values.

    Args:
        pairwise_agreement: mapping task id -> dict holding a
            ``"pairwise agreement"`` entry.

    Returns:
        The mean of the ``"pairwise agreement"`` values rounded to two
        decimals, or 0.0 when the mapping is empty (instead of raising
        ``ZeroDivisionError``).
    """
    if not pairwise_agreement:
        return 0.0
    total = sum(
        entry["pairwise agreement"] for entry in pairwise_agreement.values()
    )
    return round(total / len(pairwise_agreement), 2)


def print_tables(*tables: dict[Any, dict[Any, Any]]):
    """Render one or more column-major tables as a single HTML table.

    Each table maps a column key to a ``{row key: value}`` dict. The column
    keys of the first table define the header row; every table then
    contributes one HTML row per row key, stacked below the previous table.

    Cell text is HTML-escaped. A cell with no value is rendered blank instead
    of raising ``KeyError``, row keys are collected from every displayed
    column (not only the first one), and nothing is displayed when no table
    is passed. Lookups use ``.get`` so ``defaultdict`` inputs are not mutated.
    """
    # Imported under its own name so it cannot clash with markup variables
    from html import escape

    if not tables:
        return

    def cell(value: Any) -> str:
        return f"<td>{escape(str(value))}</td>"

    width_keys = list(tables[0].keys())
    # Header row: an empty corner cell followed by the column keys
    rows = ["<tr><td></td>" + "".join(map(cell, width_keys)) + "</tr>"]
    for table in tables:
        # Ordered union of the row keys found in the displayed columns
        height_keys = list(
            dict.fromkeys(
                y for x in width_keys if x in table for y in table[x]
            )
        )
        for y in height_keys:
            values = (table.get(x, {}).get(y, "") for x in width_keys)
            rows.append("<tr>" + cell(y) + "".join(map(cell, values)) + "</tr>")
    display(HTML("<table>" + "".join(rows) + "</table>"))


## Generation

### Generate hits.json

In [4]:
if not skip_mturk:
    # Load the dataset into a list of rows (entries are read from index 1
    # onwards; presumably row 0 is the CSV header — verify against the file).
    # NOTE(review): no explicit encoding is passed here, unlike the reads of
    # the result files further down.
    with open("group_3-cavasin_cimador_faion.csv", newline="") as f:
        statement_list = [row for row in csv.reader(f, delimiter=",", quotechar='"')]

    chars = "QWERTYUIOPASDFGHJKLZXCVBNM"  # Alphabet used to build the token IDs
    dict_array = []
    choose_human_for = 1  # Statement number that receives explanation_human

    for i in range(12):
        statements = []

        for statement_number in range(1, 4):  # Statements 1 to 3
            statement_row = statement_list[statement_number]
            # Only one statement out of three gets explanation_human
            # (column 2); the others get explanation_model (column 3)
            if choose_human_for == statement_number:
                explanation_column = 2
            else:
                explanation_column = 3
            statements.append(
                {
                    "id": statement_row[0],
                    "statement": statement_row[1],
                    "explanation": statement_row[explanation_column],
                    "label": statement_row[4],
                }
            )
        # Rotate 1 -> 2 -> 3 -> 1 for the next unit
        choose_human_for = choose_human_for % 3 + 1
        random.shuffle(statements)  # Put the three statements in random order
        token_input = "".join(random.sample(chars, 10))
        token_output = "".join(random.sample(chars, 10))
        dict_array.append(
            {
                "unit_id": "unit_" + str(i),
                "token_input": token_input,
                "token_output": token_output,
                "documents_number": 3,
                "documents": statements,
            }
        )

    with open(hits_file, "w") as json_file:
        json.dump(dict_array, json_file, indent=4)  # Serialize to the JSON file
        print("File ", hits_filename, " generated")


### Generate variables for MTurk task

In [5]:
if not skip_mturk:
    # Write the MTurk variables file: one header line plus one line per unit
    with open(mturk_vars_file, "w") as mturk_vars:
        # Column names
        mturk_vars.write("aws_deploy_bucket,aws_region,task_name,batch_name,tokens\n")
        # The first four columns are the same on every line
        shared_columns = f"{aws_deploy_bucket},{aws_region},{task_name},{batch_name}"
        for i in range(12):
            unit = dict_array[i]
            # Input and output token of the unit, separated by ";"
            tokens = f"{unit.get('token_input')};{unit.get('token_output')}"
            mturk_vars.write(f"{shared_columns},{tokens}\n")
        print("File ", mturk_vars_filename, " generated")


## Data Processing

### Calculate annotation percentage

In [6]:
docs = [0.0] * 3  # Running sum of the annotated-text fraction of each document
workers = 0  # Number of workers whose three documents were all processed


def to_percent(number):
    """Convert a fraction to a percentage rounded to one decimal place."""
    percentage = number * 100
    return round(percentage, 1)


# Sorted so that the report order does not depend on the file system
for worker_file in sorted(data_dir.iterdir()):
    with open(worker_file, encoding="utf8") as f:
        worker_data = json.load(f)
    print("Worker", worker_file.stem)
    try:
        # Read all three documents before touching the running sums: a worker
        # only contributes to `docs` and `workers` when every document could
        # be read, so a file that fails partway cannot skew the averages.
        fractions = []
        for i in range(0, 3):  # For each document
            # Last entry of the document's notes list (annotation information)
            annotations_data = worker_data[0]["data_partial"]["documents_answers"][
                i
            ]["serialization"]["notes"][-1]
            # Length of the annotated text
            annotation_length = len(annotations_data["raw_text"])
            # Length of the whole statement: annotated text plus what
            # precedes and follows it
            statement_length = (
                annotation_length
                + len(annotations_data["text_left"])
                + len(annotations_data["text_right"])
            )
            # Fraction of annotated text; an empty statement counts as 0
            fractions.append(
                annotation_length / statement_length if statement_length else 0.0
            )
    except IndexError:  # The JSON file does not hold the expected entries
        print("Worker", worker_file.name, "is empty")
    else:
        for i, annotation_fraction in enumerate(fractions):
            print(
                f"\tDocument {i}: {to_percent(annotation_fraction)}%",
            )
            # Add the fraction to the running sum used for the average
            docs[i] += annotation_fraction
        workers += 1
    print("")

if workers:
    for i in range(0, 3):
        # Average fraction of annotated text over the counted workers
        print(f"Average of Document {i}: {to_percent(docs[i] / workers)}%")
else:
    # Avoid dividing by zero when no worker file could be processed
    print("No worker with complete annotation data found")


Worker AQQPMWFVE949YW
Worker AQQPMWFVE949YW.json is empty

Worker B2PU87OELNL97V
	Document 0: 93.0%
	Document 1: 100.0%
	Document 2: 46.3%

Worker BM539KK2W9AIRX
Worker BM539KK2W9AIRX.json is empty

Worker HIOF5FD2FK3W9F
	Document 0: 17.1%
	Document 1: 21.3%
	Document 2: 50.2%

Worker IMDORB1186UKOF
	Document 0: 92.9%
	Document 1: 50.2%
	Document 2: 42.7%

Worker KFNXJP7XVSCPRX
	Document 0: 35.2%
	Document 1: 15.3%
	Document 2: 12.9%

Worker NMXJ0KIWOLKHNV
	Document 0: 10.9%
	Document 1: 88.1%
	Document 2: 42.3%

Worker OWAM2TV1SP4TU5
	Document 0: 14.7%
	Document 1: 92.9%
	Document 2: 18.7%

Worker RBG8K61JCTLLQD
	Document 0: 50.7%
	Document 1: 44.0%
	Document 2: 69.0%

Worker SKDKX9GZD4UH7L
Worker SKDKX9GZD4UH7L.json is empty

Worker TO5SRSVEVSUPUJ
	Document 0: 12.1%
	Document 1: 41.6%
	Document 2: 95.7%

Worker TWNMYWGHKNZTW0
	Document 0: 10.5%
	Document 1: 27.6%
	Document 2: 95.7%

Worker VZCC04FR6OASDY
Worker VZCC04FR6OASDY.json is empty

Worker XV8DEPZONNFP2D
Worker XV8DEPZONNFP2D

### Count annotation updates

In [7]:
# Integer totals: these are counts, so they should not print as "N.0"
docs = [0, 0, 0]

# Sorted so that the report order does not depend on the file system
for worker_file in sorted(data_dir.iterdir()):
    with open(worker_file, encoding="utf8") as jsonfile:
        worker_data = json.load(jsonfile)
    print("Worker", worker_file.stem)
    try:
        # Length of each document's notes list; collected for all three
        # documents first so that a worker reported as empty does not leave
        # partial counts in the totals.
        update_counts = [
            len(
                worker_data[0]["data_partial"]["documents_answers"][i][
                    "serialization"
                ]["notes"]
            )
            for i in range(0, 3)  # For each document
        ]
    except IndexError:  # The JSON file does not hold the expected entries
        print("Worker", worker_file.name, "is empty")
    else:
        for i, annotations_update_data in enumerate(update_counts):
            print(
                f"\tDocument {i}: annotation updated {annotations_update_data} time(s)",
            )
            # How many times the annotations of each document were updated
            docs[i] += annotations_update_data
    print("")

for i in range(0, 3):
    print(f"Document {i} annotations updated {docs[i]} time(s)")


Worker AQQPMWFVE949YW
Worker AQQPMWFVE949YW.json is empty

Worker B2PU87OELNL97V
	Document 0: annotation updated 1 time(s)
	Document 1: annotation updated 3 time(s)
	Document 2: annotation updated 1 time(s)

Worker BM539KK2W9AIRX
Worker BM539KK2W9AIRX.json is empty

Worker HIOF5FD2FK3W9F
	Document 0: annotation updated 3 time(s)
	Document 1: annotation updated 14 time(s)
	Document 2: annotation updated 1 time(s)

Worker IMDORB1186UKOF
	Document 0: annotation updated 1 time(s)
	Document 1: annotation updated 1 time(s)
	Document 2: annotation updated 1 time(s)

Worker KFNXJP7XVSCPRX
	Document 0: annotation updated 1 time(s)
	Document 1: annotation updated 2 time(s)
	Document 2: annotation updated 1 time(s)

Worker NMXJ0KIWOLKHNV
	Document 0: annotation updated 6 time(s)
	Document 1: annotation updated 2 time(s)
	Document 2: annotation updated 8 time(s)

Worker OWAM2TV1SP4TU5
	Document 0: annotation updated 1 time(s)
	Document 1: annotation updated 1 time(s)
	Document 2: annotation update

### Get elapsed time

In [8]:
docs = [0.0, 0.0, 0.0]  # Running sum of the elapsed seconds of each document
# Counted in this cell rather than reusing `workers` from the annotation
# percentage cell, so the average does not depend on that cell having run
# or on both cells accepting exactly the same set of worker files.
timed_workers = 0

# Sorted so that the report order does not depend on the file system
for worker_file in sorted(data_dir.iterdir()):
    with open(worker_file, encoding="utf8") as jsonfile:
        worker_data = json.load(jsonfile)
    print("Worker", worker_file.stem)
    try:
        # Time spent evaluating each document; all three are read first so
        # that a worker reported as empty does not leave partial sums behind.
        timings = [
            worker_data[0]["data_partial"]["documents_answers"][i][
                "serialization"
            ]["timestamps_elapsed"]
            for i in range(0, 3)
        ]
    except IndexError:  # The JSON file does not hold the expected entries
        print("Worker", worker_file.name, "is empty")
    else:
        for i, time_elapsed in enumerate(timings):
            # Rounded to one decimal for display only; the unrounded value is
            # summed so rounding errors do not accumulate in the average.
            print(
                f"\tDocument {i}: {round(time_elapsed, 1)} second(s) elapsed",
            )
            docs[i] += time_elapsed
        timed_workers += 1
    print("")

if timed_workers:
    for i in range(0, 3):
        print(
            f"Document {i} average time elapsed: {round(docs[i] / timed_workers, 1)} second(s)",
        )
else:
    # Avoid dividing by zero when no worker file could be processed
    print("No worker with complete timing data found")


Worker AQQPMWFVE949YW
Worker  AQQPMWFVE949YW.json  is empty

Worker B2PU87OELNL97V
	Document 0: 298.0 second(s) elapsed
	Document 1: 229.2 second(s) elapsed
	Document 2: 713.2 second(s) elapsed

Worker BM539KK2W9AIRX
Worker  BM539KK2W9AIRX.json  is empty

Worker HIOF5FD2FK3W9F
	Document 0: 120.3 second(s) elapsed
	Document 1: 145.6 second(s) elapsed
	Document 2: 53.0 second(s) elapsed

Worker IMDORB1186UKOF
	Document 0: 210.2 second(s) elapsed
	Document 1: 234.5 second(s) elapsed
	Document 2: 60.3 second(s) elapsed

Worker KFNXJP7XVSCPRX
	Document 0: 128.5 second(s) elapsed
	Document 1: 90.0 second(s) elapsed
	Document 2: 132.4 second(s) elapsed

Worker NMXJ0KIWOLKHNV
	Document 0: 110.0 second(s) elapsed
	Document 1: 97.1 second(s) elapsed
	Document 2: 107.8 second(s) elapsed

Worker OWAM2TV1SP4TU5
	Document 0: 95.0 second(s) elapsed
	Document 1: 108.9 second(s) elapsed
	Document 2: 68.3 second(s) elapsed

Worker RBG8K61JCTLLQD
	Document 0: 102.5 second(s) elapsed
	Document 1: 45.1 sec

### Task Matrices

In [5]:
with open(answers_file, encoding="utf8", newline="") as f:
    # Load every row of the answers CSV as a dict keyed by column name
    answers_rows = [row for row in csv.DictReader(f, delimiter=",", quotechar='"')]

# worker id -> document id -> scale name -> value
answers: dict[str, dict[int, dict[str, int]]] = defaultdict(
    lambda: defaultdict(dict)
)

# Scale name stored in `answers` paired with the CSV column it is read from
scale_columns = (
    ("truthfulness1", "doc_truthfulness-1_index"),
    ("truthfulness2", "doc_truthfulness-2_index"),
)

# The CSV is ordered by time, so later rows overwrite earlier ones and only
# the latest answers are kept
for answer_cols in answers_rows:
    for scale, column in scale_columns:
        answers[answer_cols["worker_id"]][int(answer_cols["doc_id"])][
            scale
        ] = float_atoi(answer_cols[column])

task_matrix1 = create_task_matrix(answers, "truthfulness1")
task_matrix2 = create_task_matrix(answers, "truthfulness2")

for title, matrix in (
    ("Truthfulness 1:", task_matrix1),
    ("Truthfulness 2:", task_matrix2),
):
    print(title)
    print_tables(matrix)


Truthfulness 1:


0,1,2,3
,224557,171067,47357
TO5SRSVEVSUPUJ,4,3,3
OWAM2TV1SP4TU5,5,1,5
HIOF5FD2FK3W9F,5,0,5
IMDORB1186UKOF,5,1,5
KFNXJP7XVSCPRX,4,0,1
NMXJ0KIWOLKHNV,4,1,4
B2PU87OELNL97V,2,2,0
RBG8K61JCTLLQD,4,1,4
TWNMYWGHKNZTW0,3,0,5


Truthfulness 2:


0,1,2,3
,224557,171067,47357
TO5SRSVEVSUPUJ,3,2,5
OWAM2TV1SP4TU5,2,0,5
HIOF5FD2FK3W9F,5,0,5
IMDORB1186UKOF,5,1,5
KFNXJP7XVSCPRX,5,0,5
NMXJ0KIWOLKHNV,2,0,5
B2PU87OELNL97V,5,0,4
RBG8K61JCTLLQD,5,0,5
TWNMYWGHKNZTW0,5,0,5


### Pairwise Agreements

In [6]:
# Every unordered pair of workers that appear in the answers
pairs = list(itertools.combinations(answers, 2))

pairwise_agreement1 = calc_pairwise_agreement(pairs, task_matrix1)
pairwise_agreement2 = calc_pairwise_agreement(pairs, task_matrix2)

# One report per scale: title, task matrix with its agreement row, average
reports = (
    ("Truthfulness 1:", task_matrix1, pairwise_agreement1),
    ("Truthfulness 2:", task_matrix2, pairwise_agreement2),
)
for position, (title, matrix, agreement) in enumerate(reports):
    if position > 0:
        print("")  # Blank line between consecutive reports
    print(title)
    print_tables(matrix, agreement)
    print("Average Pairwise Agreement:", calc_average_agreement(agreement))


Truthfulness 1:


0,1,2,3
,224557.0,171067.0,47357.0
TO5SRSVEVSUPUJ,4.0,3.0,3.0
OWAM2TV1SP4TU5,5.0,1.0,5.0
HIOF5FD2FK3W9F,5.0,0.0,5.0
IMDORB1186UKOF,5.0,1.0,5.0
KFNXJP7XVSCPRX,4.0,0.0,1.0
NMXJ0KIWOLKHNV,4.0,1.0,4.0
B2PU87OELNL97V,2.0,2.0,0.0
RBG8K61JCTLLQD,4.0,1.0,4.0
TWNMYWGHKNZTW0,3.0,0.0,5.0


Average Pairwise Agreement: 0.23

Truthfulness 2:


0,1,2,3
,224557.0,171067.0,47357.0
TO5SRSVEVSUPUJ,3.0,2.0,5.0
OWAM2TV1SP4TU5,2.0,0.0,5.0
HIOF5FD2FK3W9F,5.0,0.0,5.0
IMDORB1186UKOF,5.0,1.0,5.0
KFNXJP7XVSCPRX,5.0,0.0,5.0
NMXJ0KIWOLKHNV,2.0,0.0,5.0
B2PU87OELNL97V,5.0,0.0,4.0
RBG8K61JCTLLQD,5.0,0.0,5.0
TWNMYWGHKNZTW0,5.0,0.0,5.0


Average Pairwise Agreement: 0.6
