# Notebook to Process the BCause Dataset

Get the "CONVERTED" CSV files from the [BCause Dumps Directory](https://drive.google.com/drive/folders/1Y4mIT-2jjLfEA7p4RN6c2f39QC9qPM_S) and place them in the `raw/` subfolder of the data directory configured below.

In [None]:
import csv
import json
import pandas as pd
import unicodedata

from IPython.display import display
from pathlib import Path
from tqdm.notebook import tqdm

In [None]:
# Base folder of the BCause data: the raw CSV dumps are read from its "raw/"
# subfolder and the processed "bcause-data.jl" file is written directly into it.
DATA_PATH = Path("../../data/bcause/")

# Data Loading

In [None]:
# Load every raw CSV dump into one DataFrame, tagging each row with the name of
# the file it came from.
frames = []

# The loop variable is deliberately not named `csv`: that would shadow the
# `csv` module imported at the top of the notebook.
for csv_path in sorted(DATA_PATH.glob("raw/*.csv")):
    if "unina" in csv_path.name:
        # Ignore the ones in Italian
        continue
    df = pd.read_csv(csv_path).iloc[:, 1:]  # Remove index column
    df.insert(0, "filename", csv_path.name)
    frames.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the raw folder is empty or missing.
if not frames:
    raise FileNotFoundError(f"No usable CSV files found in {DATA_PATH / 'raw'}")

data = pd.concat(frames, ignore_index=True)

# Data Cleaning

In [None]:
# Replace tabs/newlines/carriage returns with plain space
data = data.replace(r"\r+|\n+|\t+", " ", regex=True)

# Strip surrounding whitespace on text columns only: the `.str` accessor raises
# AttributeError on numeric/boolean columns.
for col in data.columns:
    if pd.api.types.is_object_dtype(data[col]) or pd.api.types.is_string_dtype(
        data[col]
    ):
        data[col] = data[col].str.strip()

# Remove all the data points with debate title or node text null or empty.
# Text columns were stripped above, so a plain comparison with "" is enough.
has_title = data["debate_title"].notna() & (data["debate_title"] != "")
has_text = data["node_text"].notna() & (data["node_text"] != "")

# .copy() makes the filtered frame independent of the unfiltered one, so adding
# columns to it later does not raise SettingWithCopyWarning.
data = data[has_title & has_text].copy()

data.info()

In [None]:
# Translation from the raw BCause node types to the labels used in the output.
# `Series.map` yields NaN for any node type that is not a key of this dict.
node_type_map = {
    "POSITION": "Position",
    "OPPOSING ARGUMENT": "Attack",
    "SUPPORTING ARGUMENT": "Support",
}

# `assign` builds a new frame rather than writing a column into `data`, which
# at this point is a boolean-filtered selection; this avoids pandas'
# SettingWithCopyWarning while producing the same resulting frame.
data = data.assign(mapped_node_type=data["node_type"].map(node_type_map))

## Clearing invalid debates

Remove all debates that were built for testing, have fewer than 5 nodes, or aren't in English.

In [None]:
# Number of rows (nodes) recorded for each debate id.
debate_count = data["debate_id"].value_counts()

# Ids of the debates that have fewer than 5 nodes.
is_too_small = debate_count.lt(5)
invalid_debate = debate_count[is_too_small].index.tolist()

In [None]:
# Keep only the nodes whose debate was not flagged as invalid.
in_invalid_debate = data["debate_id"].isin(invalid_debate)
valid_data = data.loc[~in_invalid_debate]
valid_data.head()

In [None]:
# Count the nodes per debate title.
# The index and the counts are named explicitly so the result has the columns
# "debate_title" and "count" on every pandas version. Before pandas 2.0 a bare
# `value_counts().reset_index()` put the titles in a column named "index" and
# the counts in "debate_title", which breaks any later lookup of titles by
# column name.
debate_titles = (
    valid_data["debate_title"]
    .value_counts()
    .rename_axis("debate_title")
    .reset_index(name="count")
)

# Show the titles untruncated so they can be inspected by eye.
with pd.option_context("display.max_colwidth", None):
    display(debate_titles)

Remove the "test" debates and the one written in Greek.

In [None]:
# Row labels, in the `debate_titles` table, of the debates to discard.
# NOTE(review): these labels depend on the ordering produced by value_counts()
# for the current dumps; if the raw files change, or `debate_titles` is
# recomputed after this cell has already run, they will point at different
# titles. Confirm against the displayed table before relying on them.
rows_to_drop = [5, 10, 11]
invalid_titles = debate_titles.loc[rows_to_drop, "debate_title"]

# Drop every node that belongs to one of those debates.
has_invalid_title = valid_data["debate_title"].isin(invalid_titles)
valid_data = valid_data[~has_invalid_title]

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the remaining rows.
valid_data.info()

In [None]:
# Number of remaining nodes contributed by each source file.
valid_data["filename"].value_counts()

## Transformation

We now transform this data into a more standard format: a JSON Lines file with one JSON object per node.

In [None]:
def _none_if_missing(value):
    """Return None for pandas missing values (NaN/None/NA), else the value itself.

    json.dumps would otherwise write NaN as the bare token `NaN`, which is not
    valid JSON; None is written as `null`.
    """
    return None if pd.isna(value) else value


def _node_to_record(node):
    """Build the output record for one node, given as a column-name -> value mapping."""
    return {
        "dataset": "bcause",
        "id": _none_if_missing(node["node_id"]),
        # Collapse every run of whitespace into one space (split() also drops
        # leading/trailing whitespace), then apply Unicode NFKD normalization.
        "text": unicodedata.normalize("NFKD", " ".join(node["node_text"].split())),
        "metadata": {
            "filename": node["filename"],
            "author": _none_if_missing(node["node_author_id"]),
            "debate": node["debate_title"],
            "type": _none_if_missing(node["mapped_node_type"]),
            "related_to": _none_if_missing(node["node_connected_to"]),
        },
    }


# Write one JSON object per line (JSON Lines). The encoding is set explicitly
# so the file does not depend on the platform default.
with open(DATA_PATH / "bcause-data.jl", "wt", encoding="utf-8") as fh:
    for node in valid_data.to_dict(orient="records"):
        # allow_nan=False makes any remaining NaN/Infinity fail loudly instead
        # of silently producing invalid JSON.
        print(json.dumps(_node_to_record(node), allow_nan=False), file=fh)