# Notebook to Process the BCause Dataset

Get the "CONVERTED" csv files from the [BCause Dumps Directory](https://drive.google.com/drive/folders/1Y4mIT-2jjLfEA7p4RN6c2f39QC9qPM_S)

In [None]:
import csv
import json
import pandas as pd
import unicodedata

from IPython.display import display
from pathlib import Path
from tqdm.notebook import tqdm

In [None]:
# Root directory of the BCause data (relative path, resolved against the
# notebook's working directory). Raw dumps are read from `raw/` below it and
# the processed outputs are written under it.
DATA_PATH = Path("../../data/bcause/")

# Data Loading

In [None]:
# Load every raw CSV dump into a single DataFrame, tagging each row with the
# name of the file it came from.
frames = []

# NOTE: the loop variable must not be called `csv`; that would shadow the
# `csv` module imported at the top of the notebook for every later cell.
for csv_path in sorted(DATA_PATH.glob("raw/*.csv")):
    if "unina" in csv_path.name:
        # Ignore the ones in Italian
        continue
    df = pd.read_csv(csv_path).iloc[:, 1:]  # Remove index column
    df.insert(0, "filename", csv_path.name)
    frames.append(df)

# Fresh 0..n-1 index across all files.
data = pd.concat(frames, ignore_index=True)

# Data Cleaning

In [None]:
# Replace tabs/newlines/carriage returns with plain space
data = data.replace(r"\r+|\n+|\t+", " ", regex=True)

# Trim surrounding whitespace on text columns only: the `.str` accessor raises
# AttributeError on numeric columns (for instance a column that is empty in
# every dump and was therefore parsed as float).
for col in data.columns:
    if pd.api.types.is_object_dtype(data[col]) or pd.api.types.is_string_dtype(data[col]):
        data[col] = data[col].str.strip()

# Remove all the data points with debate title or node text null or empty.
# Text columns are already stripped at this point, so comparing against ""
# is enough to catch whitespace-only values.
has_title = data.debate_title.notnull() & (data.debate_title != "")
has_text = data.node_text.notnull() & (data.node_text != "")
data = data[has_title & has_text]

data.info()

In [None]:
# Short labels for the raw `node_type` values.
node_type_map = {
    "POSITION": "Position",
    "OPPOSING ARGUMENT": "Attack",
    "SUPPORTING ARGUMENT": "Support",
}

# `data` is a boolean-filtered selection at this point, so the new column is
# added with `assign` (which returns a new frame) instead of item assignment,
# which would emit a SettingWithCopyWarning. Node types missing from the map
# become NaN.
data = data.assign(mapped_node_type=data["node_type"].map(node_type_map))

## Clearing invalid debates

Remove all debates that were built for testing, have fewer than 5 nodes, or aren't in English.

In [None]:
# Number of rows (nodes) per debate id, most frequent first.
debate_count = data["debate_id"].value_counts()

# Ids of the debates that have fewer than 5 nodes.
too_small = debate_count < 5
invalid_debate = debate_count.loc[too_small].index.tolist()

In [None]:
# Keep only the rows whose debate is not in the too-small list.
in_invalid_debate = data["debate_id"].isin(invalid_debate)
valid_data = data.loc[~in_invalid_debate]
valid_data.head()

In [None]:
# Table of debate titles with their number of nodes, most frequent first.
# The column names are set explicitly because the names produced by a bare
# `value_counts().reset_index()` differ between pandas 1.x and 2.x; later
# cells read the titles from the "debate_title" column.
debate_titles = (
    valid_data["debate_title"]
    .value_counts()
    .rename_axis("debate_title")
    .reset_index(name="count")
)

# Show full titles instead of truncating them.
with pd.option_context("display.max_colwidth", None):
    display(debate_titles)

Remove the "test" debates and the one in Greek.

In [None]:
# Row labels (in `debate_titles`) of the titles to discard.
# NOTE(review): these labels depend on the frequency ordering of
# `debate_titles`, so they presumably need re-checking against the displayed
# table whenever the raw dumps change.
rows_to_drop = [5, 10, 11]
invalid_titles = debate_titles.loc[rows_to_drop, "debate_title"]

has_invalid_title = valid_data["debate_title"].isin(invalid_titles)
valid_data = valid_data.loc[~has_invalid_title]

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the remaining rows.
valid_data.info()

In [None]:
# Number of remaining rows contributed by each source file.
valid_data.filename.value_counts()

## Transformation

We now transform the cleaned data into a more standard dataset, written as one JSON object per line.

In [None]:
def _json_value(value):
    """Return *value* in a JSON-safe form.

    Missing values (NaN/None/NA) become ``None`` so they are written as
    ``null``; ``json.dumps`` would otherwise emit the bare token ``NaN``,
    which is not valid JSON. NumPy scalars are unwrapped to native Python
    objects because ``json`` cannot serialize them.
    """
    if pd.isna(value):
        return None
    if hasattr(value, "item"):
        return value.item()
    return value


# Write one JSON object per node, one object per line.
with open(DATA_PATH / "bcause-data.jl", "wt", encoding="utf-8") as fh:
    for _, node in valid_data.iterrows():
        # Collapse runs of whitespace, then apply Unicode compatibility
        # decomposition (NFKD).
        text = unicodedata.normalize("NFKD", " ".join(node["node_text"].split()))
        clean_node = {
            "dataset": "bcause",
            "id": _json_value(node["node_id"]),
            "text": text,
            "metadata": {
                "filename": node["filename"],
                "author": _json_value(node["node_author_id"]),
                "debate": node["debate_title"],
                "type": _json_value(node["mapped_node_type"]),
                "related_to": _json_value(node["node_connected_to"]),
            },
        }
        print(json.dumps(clean_node), file=fh)

# Building relationship dataset

In [None]:
# Build (position, argument, relation) records:
#   * every node whose `node_connected_to` equals the position id is paired
#     with the position, labelled with the node's own `node_type`;
#   * every node of the same file and debate whose `node_connected_to` is
#     different is paired with it as "NO RELATION".
# NOTE(review): this reads from `data`, not from the filtered `valid_data` --
# confirm that including the discarded debates is intended.
# NOTE(review): the "NO RELATION" selection is not restricted to argument
# nodes, so it also matches POSITION rows, including the position itself
# (unless it is connected to itself) -- confirm that this is intended.
positions = data[data.node_type == "POSITION"]


def _relation_record(position, argument, relation):
    """Build one output row for the pair (*position*, *argument*)."""
    return {
        "filename": position.filename,
        "debate_id": position.debate_id,
        "debate_title": position.debate_title,
        "position_id": position.node_id,
        "position_text": position.node_text,
        "argument_id": argument.node_id,
        "argument_text": argument.node_text,
        "relation": relation,
    }


relationships = []

for position in tqdm(positions.itertuples(), total=positions.shape[0]):
    # Nodes directly connected to this position.
    connected = data[data.node_connected_to == position.node_id]
    relationships.extend(
        _relation_record(position, node, node.node_type) for node in connected.itertuples()
    )

    # Nodes of the same file and debate that are not connected to it.
    same_debate = (data.filename == position.filename) & (data.debate_id == position.debate_id)
    unconnected = data[same_debate & (data.node_connected_to != position.node_id)]
    relationships.extend(
        _relation_record(position, node, "NO RELATION") for node in unconnected.itertuples()
    )

relationships = pd.DataFrame(relationships)
relationships.info()

In [None]:
# A pair is considered a duplicate when debate, position, argument and label
# all coincide; the first occurrence is kept.
dedup_key = ["debate_id", "position_id", "argument_id", "relation"]
relationships = relationships.drop_duplicates(subset=dedup_key)

# Label distribution after de-duplication.
relationships["relation"].value_counts()

In [None]:
# Pairs that carry an actual relation, re-indexed from 0.
relevant_relationships = relationships[relationships.relation != "NO RELATION"]
relevant_relationships = relevant_relationships.reset_index(drop=True)

# The split is decided by the source file name: "alpha" files form the test
# set, "beta" files the validation set, everything else is training data.
relevant_in_alpha = relevant_relationships["filename"].str.contains("alpha")
relevant_in_beta = relevant_relationships["filename"].str.contains("beta")

relevant_test_relations = relevant_relationships[relevant_in_alpha]
relevant_val_relations = relevant_relationships[relevant_in_beta]
relevant_train_relations = relevant_relationships[~(relevant_in_alpha | relevant_in_beta)]

In [None]:
# Pairs labelled "NO RELATION", re-indexed from 0.
irrelevant_relationships = relationships[relationships.relation == "NO RELATION"]
irrelevant_relationships = irrelevant_relationships.reset_index(drop=True)

# Same file-name based split as for the related pairs: "alpha" -> test,
# "beta" -> validation, everything else -> train.
irrelevant_in_alpha = irrelevant_relationships["filename"].str.contains("alpha")
irrelevant_in_beta = irrelevant_relationships["filename"].str.contains("beta")

irrelevant_test_relations = irrelevant_relationships[irrelevant_in_alpha]
irrelevant_val_relations = irrelevant_relationships[irrelevant_in_beta]
irrelevant_train_relations = irrelevant_relationships[~(irrelevant_in_alpha | irrelevant_in_beta)]

In [None]:
# Target number of "NO RELATION" pairs kept per related pair.
NEGATIVES_PER_POSITIVE = 10.5


def _downsample_negatives(negatives, positives, random_state):
    """Randomly keep about NEGATIVES_PER_POSITIVE negatives per positive.

    The sampling fraction is capped at 1.0: when there are fewer negatives
    than the target, all of them are kept (shuffled) instead of letting
    ``DataFrame.sample`` raise for ``frac > 1``. An empty *negatives* frame
    is returned unchanged rather than dividing by zero.
    """
    if negatives.empty:
        return negatives
    wanted = positives.shape[0] * NEGATIVES_PER_POSITIVE / negatives.shape[0]
    return negatives.sample(frac=min(1.0, wanted), random_state=random_state)


# The seeds (43 for test, 42 for the others) are kept as they were so that
# the sampled rows do not change.
irrelevant_test_relations = _downsample_negatives(
    irrelevant_test_relations, relevant_test_relations, random_state=43
)
irrelevant_val_relations = _downsample_negatives(
    irrelevant_val_relations, relevant_val_relations, random_state=42
)
irrelevant_train_relations = _downsample_negatives(
    irrelevant_train_relations, relevant_train_relations, random_state=42
)

In [None]:
# Seed for shuffling each split, so that re-running the notebook produces the
# same row order (the sampling of the negatives is seeded as well).
SHUFFLE_SEED = 42


def _build_split(split, relevant, irrelevant):
    """Concatenate and shuffle both groups and add a leading "split" column."""
    combined = pd.concat([relevant, irrelevant]).sample(frac=1, random_state=SHUFFLE_SEED)
    combined.insert(0, "split", split)
    return combined


test_relations = _build_split("test", relevant_test_relations, irrelevant_test_relations)
val_relations = _build_split("validation", relevant_val_relations, irrelevant_val_relations)
train_relations = _build_split("train", relevant_train_relations, irrelevant_train_relations)

full_relations_dataset = pd.concat(
    [train_relations, test_relations, val_relations], ignore_index=True
)
# Number of pairs per split and relation.
full_relations_dataset.groupby(["split", "relation"]).size()

In [None]:
# Raw relation name -> "__label__"-prefixed class label.
RELATIONS_MAP = {
    "NO RELATION": "__label__noRel",
    "OPPOSING ARGUMENT": "__label__Attack",
    "SUPPORTING ARGUMENT": "__label__Support",
}

# Relations missing from the map become NaN.
mapped_labels = full_relations_dataset["relation"].map(RELATIONS_MAP)
full_relations_dataset["mapped_relation"] = mapped_labels

# Number of pairs per split and mapped label.
full_relations_dataset.groupby(["split", "mapped_relation"]).size()

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the final dataset.
full_relations_dataset.info()

In [None]:
# `to_csv` does not create missing parent directories, so make sure the
# output directory exists before writing the full dataset (all splits).
clean_dir = DATA_PATH / "clean"
clean_dir.mkdir(parents=True, exist_ok=True)
full_relations_dataset.to_csv(clean_dir / "bcause-gamma-full.csv", index=False)

In [None]:
# Write the training split as a header-less TSV with the columns
# label, position text, argument text.
# The path is built from DATA_PATH so the file lands in the same `clean/`
# directory as the full CSV export instead of a separately hard-coded
# "../data/..." location.
train_rows = full_relations_dataset["split"] == "train"
full_relations_dataset.loc[
    train_rows, ["mapped_relation", "position_text", "argument_text"]
].to_csv(
    DATA_PATH / "clean" / "train.tsv",
    sep="\t",
    index=False,
    header=False,
)

In [None]:
# Write the test split as a header-less TSV with the columns
# label, position text, argument text.
# The path is built from DATA_PATH so the file lands in the same `clean/`
# directory as the full CSV export instead of a separately hard-coded
# "../data/..." location.
test_rows = full_relations_dataset["split"] == "test"
full_relations_dataset.loc[
    test_rows, ["mapped_relation", "position_text", "argument_text"]
].to_csv(
    DATA_PATH / "clean" / "test.tsv",
    sep="\t",
    index=False,
    header=False,
)

In [None]:
# Write the validation split as a header-less TSV with the columns
# label, position text, argument text.
# The path is built from DATA_PATH so the file lands in the same `clean/`
# directory as the full CSV export instead of a separately hard-coded
# "../data/..." location.
validation_rows = full_relations_dataset["split"] == "validation"
full_relations_dataset.loc[
    validation_rows, ["mapped_relation", "position_text", "argument_text"]
].to_csv(
    DATA_PATH / "clean" / "validation.tsv",
    sep="\t",
    index=False,
    header=False,
)