# Statements Dataset Building

In this notebook I'll be building what I call the "Statements Dataset". This is the view used by BCause, where the main idea is to determine whether a statement is a position in a debate or an argument in favor of or against a position. It treats each argument as a whole, instead of working at a lower level (detecting argument components and the relations between them), as is traditionally done in argumentation mining.

In [None]:
import csv
import json
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm.notebook import tqdm

# Data Load

In [None]:
# Root folder of the datasets, two levels above the notebook's working directory.
DATA_DIR = Path("..") / ".." / "data"

In [None]:
def load_jsonl(path: Path) -> list:
    """Read a JSON Lines file and return its decoded records in file order.

    The file is decoded explicitly as UTF-8 so the result does not depend on the
    platform's default locale encoding, and blank lines are skipped instead of
    being handed to ``json.loads`` (which would raise on them).
    """
    with open(path, "rt", encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


# Raw records of both source corpora, one dict per line of the `.jl` files.
bcause = load_jsonl(DATA_DIR / "bcause" / "bcause-data.jl")
touche = load_jsonl(DATA_DIR / "touche23-valueeval" / "touche23-data.jl")

# Dataset for Statement Type Classification

We build a dataset to detect whether a statement is a position or an attack/support type of argument.

## Bcause

Bcause is the type of data we expect, but it is also the noisier of the two datasets, so we use half of it for training and half for development (i.e., validation).

In [None]:
# One row per BCause statement: its text plus its type, the latter carrying the
# `__label__` prefix to keep the label format standard.
bcause_argument_classification = pd.DataFrame(
    [
        {
            "statement": record["text"],
            "label": f'__label__{record["metadata"]["type"]}',
        }
        for record in tqdm(bcause)
    ]
)

# Every row starts in train; a seeded random half is then moved to validation.
bcause_argument_classification["split"] = "train"
val_index = bcause_argument_classification.sample(frac=0.5, random_state=42).index
bcause_argument_classification.loc[val_index, "split"] = "validation"

# Displayed as the cell output: number of statements per (split, label).
bcause_argument_classification.groupby(["split", "label"]).size()

In [None]:
# Export one headerless TSV per split with the columns `label<TAB>statement`.
bcause_classification_dir = DATA_DIR / "statements" / "classification"
# Create the output folder up front so the export also works on a fresh checkout
# (`to_csv` does not create missing parent directories).
bcause_classification_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): with QUOTE_NONE and no escapechar the csv writer raises if a
# statement contains a tab or a line break; presumably the texts are already
# clean, but confirm.
for split in bcause_argument_classification["split"].unique():
    bcause_argument_classification.loc[
        bcause_argument_classification["split"] == split, ["label", "statement"]
    ].to_csv(
        bcause_classification_dir / f"bcause-{split}.tsv",
        quoting=csv.QUOTE_NONE,
        sep="\t",
        index=False,
        header=False,
    )

## Touche

For Touche, the arguments already have a split (train, validation, test), but the conclusions don't (each conclusion is shared by multiple arguments), so we only need to split the conclusions.

In [None]:
# One row per Touche statement. Records without an "original_split" in their
# metadata are tagged "n/a" and receive a split further down.
touche_argument_classification = pd.DataFrame(
    [
        {
            "statement": record["text"],
            "label": f'__label__{record["metadata"]["type"]}',
            "split": record["metadata"].get("original_split", "n/a"),
        }
        for record in tqdm(touche)
    ]
)

# Row labels of the statements that still lack a split.
has_no_split = touche_argument_classification["split"] == "n/a"
non_splitted = touche_argument_classification[has_no_split].index.values

# Share of "train" among the rows that already carry a split; the rows without
# one are distributed using this same proportion.
known_splits = touche_argument_classification.loc[~has_no_split, "split"]
train_fraction = len(known_splits[known_splits == "train"]) / len(known_splits)

# Seeded in-place shuffle, then cut into train / validation / test. Whatever is
# not train is halved, and test absorbs the odd leftover row.
np.random.seed(42)
np.random.shuffle(non_splitted)
train_size = int(len(non_splitted) * train_fraction)
validation_size = (len(non_splitted) - train_size) // 2

train_indices, validation_indices, test_indices = np.split(
    non_splitted, [train_size, train_size + validation_size]
)

for split_name, split_rows in (
    ("train", train_indices),
    ("test", test_indices),
    ("validation", validation_indices),
):
    touche_argument_classification.loc[split_rows, "split"] = split_name

# Displayed as the cell output: number of statements per (split, label).
touche_argument_classification.groupby(["split", "label"]).size()

In [None]:
# Export one headerless TSV per split with the columns `label<TAB>statement`.
touche_classification_dir = DATA_DIR / "statements" / "classification"
# Create the output folder up front so the export also works on a fresh checkout
# (`to_csv` does not create missing parent directories).
touche_classification_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): with QUOTE_NONE and no escapechar the csv writer raises if a
# statement contains a tab or a line break; presumably the texts are already
# clean, but confirm.
for split in touche_argument_classification["split"].unique():
    touche_argument_classification.loc[
        touche_argument_classification["split"] == split, ["label", "statement"]
    ].to_csv(
        touche_classification_dir / f"touche-{split}.tsv",
        quoting=csv.QUOTE_NONE,
        sep="\t",
        index=False,
        header=False,
    )

# Dataset for Relation Classification

We build this dataset based on the original data from Bcause and Touche.

## Bcause

Again, we build only data for training and validation from BCause.

In [None]:
# id -> text lookup for every BCause statement; needed later to turn the ids
# stored in source/target back into text.
map_id_to_text = {record["id"]: record["text"] for record in bcause}

# One row per statement whose metadata has a string "related_to", i.e. an
# argument pointing at its target. Positions have no such link and are left out.
bcause_argument_relation = pd.DataFrame(
    [
        {
            "debate": record["metadata"]["debate"],
            "source": record["id"],
            "target": record["metadata"]["related_to"],
            # The `__label__` prefix keeps the label format standard.
            "label": f'__label__{record["metadata"]["type"]}',
        }
        for record in tqdm(bcause)
        if isinstance(record["metadata"].get("related_to"), str)
    ]
)

To build the negative relations, we take the attack/support arguments within a debate and pair them with the other positions of that same debate, thus generating "hard" negative samples, since they are still somewhat related by talking about the same topic. Another option would be to sample against every other position in the whole dataset, but those would be softer negative examples.

In [None]:
# NOTE: this cell appends to `bcause_argument_relation`, so it is not idempotent.
# Re-run the cell that builds `bcause_argument_relation` before re-running it.
bcause_norel = []

# Hard negatives: pair every argument with each *other* position of its debate.
for debate, debate_df in tqdm(bcause_argument_relation.groupby("debate")):
    positions = debate_df["target"].unique().tolist()  # Positions are always target
    # leave=False keeps the per-debate inner bars from piling up in the output.
    for position in tqdm(positions, leave=False):
        for _, statement in debate_df[debate_df["target"] == position].iterrows():
            bcause_norel.extend(
                [
                    {
                        "debate": debate,
                        "source": statement["source"],
                        "target": norel_position,
                        "label": "__label__noRel",  # Add the __label__ prefix to keep it standard
                    }
                    for norel_position in positions
                    if norel_position != position
                ]
            )

# Keep only 0.2% of the candidate pairs per debate, otherwise there are too many.
# The sample is seeded so the generated dataset is the same on every run, like
# the other random steps of this notebook.
bcause_norel = (
    pd.DataFrame(bcause_norel).groupby("debate").sample(frac=0.002, random_state=42)
)

bcause_argument_relation = pd.concat([bcause_argument_relation, bcause_norel], ignore_index=True)

# Every row starts in train; a seeded random half is then moved to validation.
bcause_argument_relation["split"] = "train"

val_index = bcause_argument_relation.sample(frac=0.5, random_state=42).index

bcause_argument_relation.loc[val_index, "split"] = "validation"
# Displayed as the cell output: number of pairs per (split, label).
bcause_argument_relation.groupby(["split", "label"]).size()

In [None]:
# Replace statement ids by the statement text in both ends of every relation.
# Not idempotent: once the columns hold text, mapping them again would look the
# texts up as ids, so run this cell once per freshly built frame.
for id_column in ("source", "target"):
    bcause_argument_relation[id_column] = bcause_argument_relation[id_column].map(
        map_id_to_text
    )

# Keep only the relations whose target id was found in the lookup.
bcause_argument_relation = bcause_argument_relation[
    bcause_argument_relation["target"].notna()
]

In [None]:
# Export one headerless TSV per split with the columns `label<TAB>source<TAB>target`.
bcause_relation_dir = DATA_DIR / "statements" / "relation"
# Create the output folder up front so the export also works on a fresh checkout
# (`to_csv` does not create missing parent directories).
bcause_relation_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): with QUOTE_NONE and no escapechar the csv writer raises if a
# text contains a tab or a line break; presumably the texts are already clean,
# but confirm.
for split in bcause_argument_relation["split"].unique():
    bcause_argument_relation.loc[
        bcause_argument_relation["split"] == split, ["label", "source", "target"]
    ].to_csv(
        bcause_relation_dir / f"bcause-{split}.tsv",
        quoting=csv.QUOTE_NONE,
        sep="\t",
        index=False,
        header=False,
    )

## Touche

In [None]:
touche_argument_relation = []
# id -> text lookup for the Touche statements. This rebinds the name previously
# used for the BCause lookup.
map_id_to_text = {}

for entry in tqdm(touche):
    # Register every statement so ids can be turned back into text later.
    map_id_to_text[entry["id"]] = entry["text"]

    metadata = entry["metadata"]
    related_to = metadata.get("related_to")
    # Only statements with a string "related_to" become relation rows;
    # positions have no such link and are left out.
    if isinstance(related_to, str):
        touche_argument_relation.append(
            {
                "subdataset": metadata["subdataset"],
                "source": entry["id"],
                "target": related_to,
                # The `__label__` prefix keeps the label format standard.
                "label": f'__label__{metadata["type"]}',
                "split": metadata["original_split"],
            }
        )

touche_argument_relation = pd.DataFrame(touche_argument_relation)

In this case we pair statements within the same subdataset. The resulting negatives aren't as hard as in the case of Bcause, but it's still a better solution than comparing everything against everything.

In [None]:
touche_norel = []

# Negative pairs: every argument is paired with each *other* position found in
# its own subdataset. Their split is left as "n/a" and assigned later.
for subdataset, subdataset_df in tqdm(touche_argument_relation.groupby("subdataset")):
    positions = subdataset_df["target"].unique().tolist()  # Positions are always target
    for position in tqdm(positions):
        other_positions = [candidate for candidate in positions if candidate != position]
        sources = subdataset_df.loc[subdataset_df["target"] == position, "source"]
        for source in sources:
            for norel_position in other_positions:
                touche_norel.append(
                    {
                        "subdataset": subdataset,
                        "source": source,
                        "target": norel_position,
                        # The `__label__` prefix keeps the label format standard.
                        "label": "__label__noRel",
                        "split": "n/a",
                    }
                )

In [None]:
# Keep only 1% of the candidate pairs per subdataset, otherwise there are too
# many. The sample is seeded so the generated dataset is the same on every run,
# like the other random steps of this notebook.
touche_norel = (
    pd.DataFrame(touche_norel).groupby("subdataset").sample(frac=0.01, random_state=42)
)

In [None]:
# Stack the sampled noRel pairs below the real relations and renumber the rows.
# Not idempotent: re-running this cell appends the noRel rows again.
touche_argument_relation = pd.concat(
    objs=[touche_argument_relation, touche_norel],
    ignore_index=True,
)

In [None]:
# Row labels of the pairs that still lack a split (the rows tagged "n/a").
pending_mask = touche_argument_relation["split"].eq("n/a")
non_splitted = touche_argument_relation[pending_mask].index.values

# Share of "train" among the rows that already carry a split; the pending rows
# are distributed using this same proportion.
assigned_splits = touche_argument_relation.loc[~pending_mask, "split"]
train_fraction = len(assigned_splits[assigned_splits == "train"]) / len(assigned_splits)

# Seeded in-place shuffle, then cut into train / validation / test. Whatever is
# not train is halved, and test absorbs the odd leftover row.
np.random.seed(42)
np.random.shuffle(non_splitted)
pending_total = len(non_splitted)
train_size = int(pending_total * train_fraction)
validation_size = (pending_total - train_size) // 2
validation_end = train_size + validation_size

train_indices = non_splitted[:train_size]
validation_indices = non_splitted[train_size:validation_end]
test_indices = non_splitted[validation_end:]

for split_name, split_rows in zip(
    ("train", "test", "validation"),
    (train_indices, test_indices, validation_indices),
):
    touche_argument_relation.loc[split_rows, "split"] = split_name

# Displayed as the cell output: number of pairs per (split, label).
touche_argument_relation.groupby(["split", "label"]).size()

In [None]:
# Replace statement ids by the statement text in both ends of every relation.
# Not idempotent: once the columns hold text, mapping them again would look the
# texts up as ids, so run this cell once per freshly built frame.
touche_argument_relation = touche_argument_relation.assign(
    source=touche_argument_relation["source"].map(map_id_to_text),
    target=touche_argument_relation["target"].map(map_id_to_text),
)

In [None]:
# Export one headerless TSV per split with the columns `label<TAB>source<TAB>target`.
touche_relation_dir = DATA_DIR / "statements" / "relation"
# Create the output folder up front so the export also works on a fresh checkout
# (`to_csv` does not create missing parent directories).
touche_relation_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): with QUOTE_NONE and no escapechar the csv writer raises if a
# text contains a tab or a line break; presumably the texts are already clean,
# but confirm.
for split in touche_argument_relation["split"].unique():
    touche_argument_relation.loc[
        touche_argument_relation["split"] == split, ["label", "source", "target"]
    ].to_csv(
        touche_relation_dir / f"touche-{split}.tsv",
        quoting=csv.QUOTE_NONE,
        sep="\t",
        index=False,
        header=False,
    )