# Notebook to Preprocess the Conference on the Future of Europe Dataset

In [None]:
import json
import pandas as pd

from pathlib import Path
from xxhash import xxh64_hexdigest

In [None]:
def hash_text(text: str) -> str:
    """Return a 64-bit xxHash hex digest of *text*, ignoring case and whitespace.

    The text is stripped, lower-cased and has all whitespace removed before
    hashing, so inputs that differ only in casing or spacing share a digest.
    """
    lowered = text.strip().lower()
    # str.split() with no argument splits on any whitespace run, so joining the
    # pieces with an empty string removes every whitespace character.
    squeezed = "".join(lowered.split())
    return xxh64_hexdigest(squeezed, seed=42)

In [None]:
# Root directory of the dataset (a relative path, resolved against the current
# working directory). The raw TSV inputs are read from its `raw/` subfolder and
# the processed `cofe-data.jl` file is written directly into it.
DATA_DIR = Path("../../data/conference-on-the-future-of-europe/")

In [None]:
# Load the three argument splits (tab-separated) in train / validation / test order.
train_data, dev_data, test_data = (
    pd.read_csv(DATA_DIR / relative_path, sep="\t")
    for relative_path in (
        "raw/arguments-training.tsv",
        "raw/arguments-validation.tsv",
        "raw/arguments-test.tsv",
    )
)
# The metadata table uses its first column as the index; later cells look rows
# up in it by "Argument ID".
metadata = pd.read_csv(
    DATA_DIR / "raw/meta-arguments-e.tsv",
    sep="\t",
    index_col=0,
)

# Keep only COFE data

In [None]:
# Preview the first metadata rows; max_colwidth=None stops pandas from
# truncating long cell values while the preview is rendered.
with pd.option_context("display.max_colwidth", None):
    display(metadata.head())

In [None]:
# Tag every row with the split it came from, then keep only the arguments whose
# "Argument ID" appears in the metadata index and stack them into one frame.
frames_by_split = {
    "train": train_data,
    "test": test_data,
    "validation": dev_data,
}
for split_name, frame in frames_by_split.items():
    frame["split"] = split_name

dataset = pd.concat(
    [
        frame[frame["Argument ID"].isin(metadata.index)]
        for frame in frames_by_split.values()
    ],
    ignore_index=True,
)
dataset.head()

# Map metadata URLs

We use the URL as a proxy identifier for the author of a premise.

In [None]:
# Derive two identifiers from each argument's URL:
#   claim_url      - everything before the first "?"
#   premise_author - everything before the first "&"
source_urls = metadata["URL"]
metadata["claim_url"] = source_urls.map(lambda url: url.partition("?")[0])
metadata["premise_author"] = source_urls.map(lambda url: url.partition("&")[0])

derived_columns = ["claim_url", "premise_author"]
with pd.option_context("display.max_colwidth", None):
    display(metadata[derived_columns].head())

# Map Stance to Relation

In [None]:
# Inspect which stance labels occur (and how often) before mapping them to relations.
dataset["Stance"].value_counts()

In [None]:
# Translate the dataset's stance labels into the relation labels that are
# later stored on each premise node.
stance_map = {
    "in favor of": "SUPPORT",
    "against": "ATTACK"
}

# NOTE: Series.map yields NaN for any stance value that is not a key of stance_map.
dataset["relation"] = dataset["Stance"].map(stance_map)
dataset.head()

# Build IDs

In [None]:
# Each ID is the hash of the corresponding text column, so rows whose text
# hashes to the same value share an ID.
id_sources = (
    ("claim_id", "Conclusion"),
    ("premise_id", "Premise"),
)
for id_column, text_column in id_sources:
    dataset[id_column] = dataset[text_column].map(hash_text)
dataset.head()

# Save Dataset

In [None]:
# Serialise the dataset as JSON Lines: for every distinct claim_id, one CLAIM
# node followed by one PREMISE node per row that refers to that claim.
# The encoding is given explicitly so the output does not depend on the
# platform's default text encoding.
with open(DATA_DIR / "cofe-data.jl", "wt", encoding="utf-8") as fh:
    for claim_id, claim_df in dataset.groupby("claim_id"):
        # The first row of the group supplies the claim's text and the
        # argument whose metadata entry provides the claim's author.
        first_argument = claim_df.iloc[0]
        claim_node = {
            "dataset": "cofe",
            "id": claim_id,
            # Columns are addressed by name rather than by position, so the
            # output stays correct even if the TSV column order changes.
            "author": metadata.loc[first_argument["Argument ID"], "claim_url"],
            "text": first_argument["Conclusion"],
            "metadata": {
                "type": "CLAIM",
            },
        }
        print(json.dumps(claim_node), file=fh)

        # The row index is not needed; only the row contents are written out.
        for _, premise in claim_df.iterrows():
            argument_id = premise["Argument ID"]
            premise_node = {
                "dataset": "cofe",
                "id": premise["premise_id"],
                "author": metadata.loc[argument_id, "premise_author"],
                "text": premise["Premise"],
                "metadata": {
                    "argument_id": argument_id,
                    "relation": premise["relation"],
                    # Links the premise back to the claim node written above.
                    "related_to": claim_id,
                    "split": premise["split"],
                    "type": "PREMISE",
                },
            }
            print(json.dumps(premise_node), file=fh)