# Notebook to Preprocess the Data From Touche23-ValueEval Dataset

This is the dataset from the [SemEval-2023 ValueEval task](https://touche.webis.de/semeval23/touche23-web/). We only use the subgroups A, D and E.

In [None]:
import json
import pandas as pd

from pathlib import Path
from xxhash import xxh64_hexdigest

In [None]:
def hash_text(text: str) -> str:
    """
    Return a hex digest of `text` that ignores letter case and whitespace.

    The text is stripped, lower-cased and has all whitespace removed before it
    is hashed, so near-duplicates that differ only in those respects receive
    the same digest.
    """
    lowered = text.strip().lower()
    compact = "".join(lowered.split())
    return xxh64_hexdigest(compact, seed=42)

In [None]:
# Root folder of the Touche23-ValueEval data, given relative to the working
# directory. The raw TSV files are read from its "raw/" subfolder and the
# processed JSON-lines file is written directly into it.
DATA_DIR = Path("../../data/touche23-valueeval/")

In [None]:
# Load the three raw splits (tab-separated files) in the order
# training, validation, test.
train_data, dev_data, test_data = (
    pd.read_csv(DATA_DIR / relative_path, sep="\t")
    for relative_path in (
        "raw/arguments-training.tsv",
        "raw/arguments-validation.tsv",
        "raw/arguments-test.tsv",
    )
)

In [None]:
# Record on every row which split it came from, then stack the splits
# (train, test, validation) into a single frame with a fresh index.
splits = {"train": train_data, "test": test_data, "validation": dev_data}
for split_name, frame in splits.items():
    frame["split"] = split_name

dataset = pd.concat(list(splits.values()), ignore_index=True)
dataset.head()

# Map Stance to Relation

In [None]:
# Translate the stance labels into relation types. Stance values that are not
# keys of the mapping end up as NaN in the "relation" column.
stance_map = {"in favor of": "Support", "against": "Attack"}

stance_column = dataset["Stance"]
dataset["relation"] = stance_column.map(stance_map)
dataset.head()

# Build IDs

In [None]:
# Derive an identifier for each conclusion and premise by hashing its
# normalised text, so identical texts share one ID.
for id_column, text_column in (
    ("conclusion_id", "Conclusion"),
    ("premise_id", "Premise"),
):
    dataset[id_column] = dataset[text_column].map(hash_text)
dataset.head()

# Save Dataset

In [None]:
# Write the dataset as JSON lines: for every distinct conclusion, one
# "Position" node followed by one node per premise row that refers to it.
with open(DATA_DIR / "touche23-data.jl", "wt", encoding="utf-8") as fh:
    for conclusion_id, conclusion_df in dataset.groupby("conclusion_id"):
        # All rows of the group share the same conclusion ID, so the first
        # row is used as the representative for the conclusion node.
        first_row = conclusion_df.iloc[0]
        conclusion_node = {
            "dataset": "touche-23",
            "id": conclusion_id,
            # Columns are addressed by label rather than by position so the
            # output does not depend on the column order of the frame.
            "text": first_row["Conclusion"],
            "metadata": {
                # The first letter of the Argument ID reveals the subdataset
                "subdataset": first_row["Argument ID"][0],
                "type": "Position",
            },
        }
        print(json.dumps(conclusion_node), file=fh)

        for _, premise in conclusion_df.iterrows():
            premise_node = {
                # Same dataset tag as the conclusion nodes (was "cofe", which
                # does not match the rest of this file's output).
                "dataset": "touche-23",
                "id": premise["premise_id"],
                "text": premise["Premise"],
                "metadata": {
                    "argument_id": premise["Argument ID"],
                    "subdataset": premise["Argument ID"][0],
                    "related_to": conclusion_id,
                    "original_split": premise["split"],
                    # "Attack" or "Support", as mapped from the stance.
                    "type": premise["relation"],
                },
            }
            print(json.dumps(premise_node), file=fh)