# Notebook to Preprocess the Conference on the Future of Europe Dataset

In [1]:
import json
import pandas as pd

from pathlib import Path
from xxhash import xxh64_hexdigest

In [2]:
def hash_text(text: str) -> str:
    """Return a hexadecimal xxh64 digest of a normalised copy of ``text``.

    The text is lower-cased and every whitespace character is removed before
    hashing, so strings that differ only in case or spacing get the same id.
    The seed is fixed so the ids are the same on every run.
    """
    tokens = text.strip().lower().split()
    normalised = "".join(tokens)
    return xxh64_hexdigest(normalised, seed=42)

In [3]:
# Dataset folder (relative path): the raw TSV files are read from its "raw/"
# subfolder and the processed "cofe-data.jl" file is written directly into it.
DATA_DIR = Path("../../data/conference-on-the-future-of-europe/")

In [4]:
# The three argument splits share the same tab-separated layout, so they are
# read in one pass (training, validation, test - in that order).
train_data, dev_data, test_data = (
    pd.read_csv(DATA_DIR / raw_file, sep="\t")
    for raw_file in (
        "raw/arguments-training.tsv",
        "raw/arguments-validation.tsv",
        "raw/arguments-test.tsv",
    )
)

# The metadata file is indexed by its first column so rows can be looked up
# directly by that key.
metadata = pd.read_csv(
    DATA_DIR / "raw/meta-arguments-e.tsv",
    sep="\t",
    index_col=0,
)

# Keep only the COFE (Conference on the Future of Europe) data

In [5]:
# Lift the column-width limit for this display only, so the URLs are shown in
# full instead of being truncated.
with pd.option_context("display.max_colwidth", None):
    display(metadata.head())

Unnamed: 0_level_0,URL
Argument ID,Unnamed: 1_level_1
E01001,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/64?commentId=3311&toggle_translations=true#comment_3311
E01002,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/968?commentId=1039&toggle_translations=true#comment_1039
E01003,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/968?commentId=1039&toggle_translations=true#comment_1039
E01004,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/64?commentId=175198&toggle_translations=true#comment_175198
E01005,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/64?commentId=175198&toggle_translations=true#comment_175198


In [6]:
# Tag each frame with the split it came from before the frames are merged.
train_data["split"] = "train"
test_data["split"] = "test"
dev_data["split"] = "validation"

# Keep only the arguments whose id appears in the metadata index, then stack
# the filtered frames (train, test, validation) under a fresh 0..n-1 index.
dataset = pd.concat(
    [
        split_df.loc[split_df["Argument ID"].isin(metadata.index)]
        for split_df in (train_data, test_data, dev_data)
    ],
    ignore_index=True,
)
dataset.head()

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,split
0,E01002,Electric cars are not a solution to air pollution,in favor of,It would be smarter and cheaper to develop and...,train
1,E01003,Electric cars are not a solution to air pollution,in favor of,most of the electric power is produced by ther...,train
2,E01013,Biogas plants should be assessed on the basis ...,in favor of,Maize biogas plants need more energy to operat...,train
3,E01018,The EU should take responsibility for global e...,in favor of,climate change can only be successfully addres...,train
4,E01020,Companies should pay for the environmental dam...,in favor of,"If all costs, including CO2 emissions of the p...",train


# Map metadata URLs

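We use the comment URL (the part up to and including its `commentId` parameter) as a proxy identifier for the author of a premise, and the proposal URL (the part before the query string) to identify the claim.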

In [7]:
# Everything before the query string is the proposal page the claim belongs to.
metadata["claim_url"] = metadata["URL"].map(lambda url: url.partition("?")[0])

# Everything before the first "&" keeps the commentId parameter, which is used
# as a stand-in for the author of the premise.
metadata["premise_author"] = metadata["URL"].map(lambda url: url.partition("&")[0])

# Show the derived columns without truncating the URLs.
with pd.option_context("display.max_colwidth", None):
    display(metadata[["claim_url", "premise_author"]].head())

Unnamed: 0_level_0,claim_url,premise_author
Argument ID,Unnamed: 1_level_1,Unnamed: 2_level_1
E01001,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/64,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/64?commentId=3311
E01002,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/968,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/968?commentId=1039
E01003,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/968,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/968?commentId=1039
E01004,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/64,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/64?commentId=175198
E01005,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/64,https://futureu.europa.eu/en/processes/GreenDeal/f/1/proposals/64?commentId=175198


# Map Stance to Relation

In [8]:
# List the distinct stance labels and how often each occurs; these are the
# values that get translated into relations below.
dataset["Stance"].value_counts()

Stance
in favor of    750
against        348
Name: count, dtype: int64

In [9]:
# Translate the dataset's stance labels into the relation labels stored on
# the premise nodes.
stance_map = {
    "in favor of": "SUPPORT",
    "against": "ATTACK"
}

dataset["relation"] = dataset["Stance"].map(stance_map)

# Series.map leaves NaN for any label missing from the dict; such a value would
# later be serialised as a bare NaN token, which is not valid JSON. Fail here,
# next to the mapping, instead of producing a corrupt export.
unmapped_stances = dataset.loc[dataset["relation"].isna(), "Stance"].unique()
assert len(unmapped_stances) == 0, f"Unmapped stance labels: {list(unmapped_stances)}"

dataset.head()

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,split,relation
0,E01002,Electric cars are not a solution to air pollution,in favor of,It would be smarter and cheaper to develop and...,train,SUPPORT
1,E01003,Electric cars are not a solution to air pollution,in favor of,most of the electric power is produced by ther...,train,SUPPORT
2,E01013,Biogas plants should be assessed on the basis ...,in favor of,Maize biogas plants need more energy to operat...,train,SUPPORT
3,E01018,The EU should take responsibility for global e...,in favor of,climate change can only be successfully addres...,train,SUPPORT
4,E01020,Companies should pay for the environmental dam...,in favor of,"If all costs, including CO2 emissions of the p...",train,SUPPORT


# Build IDs

In [10]:
# Derive an id for each text column by hashing its normalised text, so rows
# with the same conclusion (up to case and whitespace) share one claim_id.
for id_column, text_column in (("claim_id", "Conclusion"), ("premise_id", "Premise")):
    dataset[id_column] = dataset[text_column].map(hash_text)

dataset.head()

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,split,relation,claim_id,premise_id
0,E01002,Electric cars are not a solution to air pollution,in favor of,It would be smarter and cheaper to develop and...,train,SUPPORT,d4564e82f905436c,7f0142d0e42ca2fd
1,E01003,Electric cars are not a solution to air pollution,in favor of,most of the electric power is produced by ther...,train,SUPPORT,d4564e82f905436c,b8823561cc967800
2,E01013,Biogas plants should be assessed on the basis ...,in favor of,Maize biogas plants need more energy to operat...,train,SUPPORT,8c85a31138444eb4,8cec6833ae89decc
3,E01018,The EU should take responsibility for global e...,in favor of,climate change can only be successfully addres...,train,SUPPORT,eba468ff4a961580,8ba147a434af1682
4,E01020,Companies should pay for the environmental dam...,in favor of,"If all costs, including CO2 emissions of the p...",train,SUPPORT,8267fb326da38116,531781f47f03dab9


# Save Dataset

In [11]:
# Write the dataset as JSON Lines: one CLAIM node per distinct claim_id,
# immediately followed by one PREMISE node for each row of that claim.
# The encoding is stated explicitly so the file does not depend on the
# platform's default locale.
with open(DATA_DIR / "cofe-data.jl", "wt", encoding="utf-8") as fh:
    for claim_id, claim_df in dataset.groupby("claim_id"):
        # All rows of a group share the claim_id, so the first row supplies the
        # claim text and the Argument ID used to look up the claim URL.
        # Columns are addressed by name rather than by position so the export
        # does not silently break if the column order of the raw files changes.
        first_row = claim_df.iloc[0]
        claim_node = {
            "dataset": "cofe",
            "id": claim_id,
            "author": metadata.loc[first_row["Argument ID"], "claim_url"],
            "text": first_row["Conclusion"],
            "metadata": {
                "type": "CLAIM"
            }
        }
        print(json.dumps(claim_node), file=fh)

        # One PREMISE node per row, linked back to its claim via "related_to".
        for _, premise in claim_df.iterrows():
            premise_node = {
                "dataset": "cofe",
                "id": premise["premise_id"],
                "author": metadata.loc[premise["Argument ID"], "premise_author"],
                "text": premise["Premise"],
                "metadata": {
                    "argument_id": premise["Argument ID"],
                    "relation": premise["relation"],
                    "related_to": claim_id,
                    "split": premise["split"],
                    "type": "PREMISE"
                }
            }
            print(json.dumps(premise_node), file=fh)