In [1]:
import csv
import json
import pandas as pd

from IPython.display import display
from pathlib import Path
from tqdm.notebook import tqdm

In [2]:
# Base directory of the BCause data: the raw CSVs are globbed from `raw/` under
# it and the JSON-lines export is written directly into it.
DATA_PATH = Path("../../data/bcause/")

In [3]:
# Load every raw CSV export (skipping the 'unina' files) into a single frame.
frames = []

# The loop variable is intentionally not named `csv`: that would shadow the
# `csv` module imported at the top of the notebook.
for csv_path in sorted(DATA_PATH.glob("raw/*.csv")):
    if 'unina' in csv_path.name:
        continue
    frame = pd.read_csv(csv_path).iloc[:, 1:]  # Remove index column
    frame.insert(0, 'filename', csv_path.name)  # Keep track of the source file
    frames.append(frame)

data = pd.concat(frames, ignore_index=True)
# Replace runs of carriage returns, newlines or tabs with a single space
data = data.replace(r'\r+|\n+|\t+', ' ', regex=True)
# Trim surrounding whitespace in every column
for col in data.columns:
    data[col] = data[col].str.strip()

In [4]:
# Keep only the nodes that carry both a non-empty debate title and a non-empty text.
has_title = data.debate_title.notnull() & (data.debate_title.str.strip() != '')
has_text = data.node_text.notnull() & (data.node_text.str.strip() != '')
data = data[has_title & has_text]

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10162 entries, 0 to 10345
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   filename           10162 non-null  object
 1   debate_id          10162 non-null  object
 2   debate_title       10162 non-null  object
 3   node_type          10162 non-null  object
 4   node_id            10162 non-null  object
 5   node_text          10162 non-null  object
 6   node_author_id     10162 non-null  object
 7   node_connected_to  5497 non-null   object
dtypes: object(8)
memory usage: 714.5+ KB


# Clearing invalid nodes

## Clearing invalid debates

Remove all debates that were built for testing, have fewer than 5 nodes, or aren't in English.

In [5]:
# Count the nodes of each debate and collect the ids of debates with fewer than 5 nodes.
debate_count = data["debate_id"].value_counts()
too_small = debate_count.lt(5)
invalid_debate = debate_count.loc[too_small].index.to_list()

In [6]:
# Drop every node that belongs to one of the debates flagged as invalid.
in_invalid_debate = data["debate_id"].isin(invalid_debate)
valid_data = data.loc[~in_invalid_debate]
valid_data.head()

Unnamed: 0,filename,debate_id,debate_title,node_type,node_id,node_text,node_author_id,node_connected_to
0,bcause-alpha.json_CONVERTED.csv,-M_1YNLl74hj1-KvS_vL,How Can Manmade Climate Change Be Reversed? Is...,POSITION,-M_1YdSI2g-N2cgAzDU-,It may be technically possible to reverse clim...,HlM8jStvA6X6JAdYpN7TdcF7m2Q2,
1,bcause-alpha.json_CONVERTED.csv,-M_1YNLl74hj1-KvS_vL,How Can Manmade Climate Change Be Reversed? Is...,OPPOSING ARGUMENT,-M_1Z58ayAjsmDKcxncU,We cannot invest trillions in an attempt to re...,euDhKeMgqUbYY2MK3vII7neGM1k2,-M_1YdSI2g-N2cgAzDU-
2,bcause-alpha.json_CONVERTED.csv,-M_1YNLl74hj1-KvS_vL,How Can Manmade Climate Change Be Reversed? Is...,SUPPORTING ARGUMENT,-M_1ZXSxGhCsmP__58E7,If governments around the world treat climate ...,ihxtSIQaIxdIw3qmHIhNM61dtZk2,-M_1YdSI2g-N2cgAzDU-
3,bcause-alpha.json_CONVERTED.csv,-M_1YNLl74hj1-KvS_vL,How Can Manmade Climate Change Be Reversed? Is...,SUPPORTING ARGUMENT,-M_1aE5oVKIC93azEXKq,Every year that passes we lose tools or opport...,NbZpWoDgerQ8HQMLyq01Mfa4FsY2,-M_1YdSI2g-N2cgAzDU-
4,bcause-alpha.json_CONVERTED.csv,-M_1YNLl74hj1-KvS_vL,How Can Manmade Climate Change Be Reversed? Is...,SUPPORTING ARGUMENT,-MamJpJaEdukwsdyTjts,It is getter harder year by year by still poss...,isW1tQXDFghR8sdBcIUCJ5xiQSC3,-M_1YdSI2g-N2cgAzDU-


In [7]:
# Number of nodes per debate title, as a frame with a 0..n-1 row index.
title_counts = valid_data["debate_title"].value_counts()
debate_titles = title_counts.reset_index()

# Lift the column-width limit so that the titles are shown in full
with pd.option_context("display.max_colwidth", None):
    display(debate_titles)

Unnamed: 0,debate_title,count
0,How can we fairly reduce hate speech from newspapers websites without eliminating space for readers comments?,9424
1,How Can Manmade Climate Change Be Reversed? Is that even possible?,352
2,Do the priorities of development and decarbonisation of heritage buildings need to be adjusted in the light of the climate emergency?,112
3,Do you think removing the comment space from newspapers website is a fair choice to reduce hate speech?,79
4,What are the key factors influencing the fairness of filtering mechanisms to reduce hate speech?,31
5,Testing-DDIS,26
6,Should all humans go vegan?,23
7,Should we be worried about AI technologies?,17
8,Why is diversity crucial in environmental issues and what can we do to achieve it?,12
9,"How to make ""Living With COVID"" more than an empty political slogan?",12


Remove the "test" debates and the one written in Greek.

In [8]:
# Row labels of `debate_titles` (the per-title count table) to discard.
# NOTE(review): these labels depend on the ordering of that table — confirm they
# still point at the intended debates whenever the raw data changes.
rows_to_drop = [5, 10, 11]
invalid_titles = debate_titles.loc[rows_to_drop, "debate_title"]

has_invalid_title = valid_data["debate_title"].isin(invalid_titles)
valid_data = valid_data[~has_invalid_title]

In [9]:
# Column / non-null summary of the nodes left after the debate filters above
valid_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10086 entries, 0 to 10345
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   filename           10086 non-null  object
 1   debate_id          10086 non-null  object
 2   debate_title       10086 non-null  object
 3   node_type          10086 non-null  object
 4   node_id            10086 non-null  object
 5   node_text          10086 non-null  object
 6   node_author_id     10086 non-null  object
 7   node_connected_to  5457 non-null   object
dtypes: object(8)
memory usage: 709.2+ KB


In [10]:
# Number of remaining nodes per source file
valid_data.filename.value_counts()

filename
bcause-exp-gamma.json_CONVERTED.csv      2031
bcause-exp-beta.json_CONVERTED.csv       1954
bcause-exp-delta.json_CONVERTED.csv      1948
bcause-exp-epsilon.json_CONVERTED.csv    1815
bcause-exp-theta.json_CONVERTED.csv      1676
bcause-alpha.json_CONVERTED.csv           662
Name: count, dtype: int64

## Transformation

We now transform this data into a more standard dataset format: one JSON object per node, one per line.

In [11]:
# Export every valid node as one JSON object per line (JSON-lines).
with open(DATA_PATH / "bcause-data.jl", "wt", encoding="utf-8") as fh:
    for node in valid_data.to_dict(orient="records"):
        # Nodes without a parent hold NaN here. `json.dumps` would serialise it
        # as the bare token `NaN`, which is not valid JSON, so write null instead.
        related_to = node["node_connected_to"]
        if pd.isna(related_to):
            related_to = None

        clean_node = {
            "dataset": "bcause",
            "id": node["node_id"],
            "author": node["node_author_id"],
            "text": node["node_text"],
            "metadata": {
                "filename": node["filename"],
                "debate": node["debate_title"],
                "type": node["node_type"],
                "related_to": related_to
            }
        }
        print(json.dumps(clean_node), file=fh)

# Building relationship dataset

In [None]:
def _relation_record(position, argument, relation):
    """Build one (position, argument) row labelled with `relation`.

    `position` and `argument` are rows produced by `DataFrame.itertuples()`.
    """
    return {
        "filename": position.filename,
        "debate_id": position.debate_id,
        "debate_title": position.debate_title,
        "position_id": position.node_id,
        "position_text": position.node_text,
        "argument_id": argument.node_id,
        "argument_text": argument.node_text,
        "relation": relation
    }


# NOTE(review): this cell iterates over `data`, not over the `valid_data` frame
# filtered earlier — confirm that keeping the discarded debates is intended.
positions = data[data.node_type == 'POSITION']  # computed once, also sizes the progress bar
relationships = []

for position in tqdm(positions.itertuples(), total=positions.shape[0]):
    # Nodes attached directly to this position: the label is the node's own type
    connected = data[data.node_connected_to == position.node_id]
    relationships.extend(
        _relation_record(position, node, node.node_type)
        for node in connected.itertuples()
    )

    # Every other node of the same file and debate is a negative example.
    # The position itself is excluded explicitly: its `node_connected_to` never
    # equals its own id, so it would otherwise be paired with itself and
    # labelled "NO RELATION".
    unrelated = data[
        (data.filename == position.filename) &
        (data.debate_id == position.debate_id) &
        (data.node_connected_to != position.node_id) &
        (data.node_id != position.node_id)]
    relationships.extend(
        _relation_record(position, node, "NO RELATION")
        for node in unrelated.itertuples()
    )

relationships = pd.DataFrame(relationships)
relationships.info()

In [None]:
# A pair is a duplicate when debate, position, argument and label all coincide.
dedup_keys = ['debate_id', 'position_id', 'argument_id', 'relation']
relationships = relationships.drop_duplicates(subset=dedup_keys)
relationships['relation'].value_counts()

In [None]:
# Positive pairs (any label other than 'NO RELATION'), split by source file:
# files whose name contains 'alpha' -> test, 'beta' -> validation, the rest -> train.
is_relevant = relationships.relation.ne('NO RELATION')
relevant_relationships = relationships[is_relevant].reset_index(drop=True)

relevant_source = relevant_relationships['filename']
relevant_test_relations = relevant_relationships[relevant_source.str.contains('alpha')]
relevant_val_relations = relevant_relationships[relevant_source.str.contains('beta')]

# Train is whatever is left once the test and validation rows are taken out
relevant_held_out = relevant_test_relations.index.union(relevant_val_relations.index)
relevant_train_relations = relevant_relationships.drop(index=relevant_held_out)

In [None]:
# Negative pairs ('NO RELATION'), split by source file with the same rule:
# files whose name contains 'alpha' -> test, 'beta' -> validation, the rest -> train.
is_irrelevant = relationships.relation.eq('NO RELATION')
irrelevant_relationships = relationships[is_irrelevant].reset_index(drop=True)

irrelevant_source = irrelevant_relationships['filename']
irrelevant_test_relations = irrelevant_relationships[irrelevant_source.str.contains('alpha')]
irrelevant_val_relations = irrelevant_relationships[irrelevant_source.str.contains('beta')]

# Train is whatever is left once the test and validation rows are taken out
irrelevant_held_out = irrelevant_test_relations.index.union(irrelevant_val_relations.index)
irrelevant_train_relations = irrelevant_relationships.drop(index=irrelevant_held_out)

In [None]:
def _downsample_negatives(negatives, positives, seed, ratio=10.5):
    """Randomly keep about `ratio` negative rows per positive row.

    The sampled fraction is capped at 1.0, so a split with fewer negatives than
    `ratio * len(positives)` keeps all of them instead of making
    `DataFrame.sample` raise. An empty `negatives` frame is returned unchanged
    rather than dividing by zero.
    """
    if negatives.empty:
        return negatives
    fraction = min(1.0, positives.shape[0] * ratio / negatives.shape[0])
    return negatives.sample(frac=fraction, random_state=seed)


# Seeds are unchanged (43 for test, 42 for the other two splits) so existing
# samples stay the same whenever no capping is needed.
irrelevant_test_relations = _downsample_negatives(
    irrelevant_test_relations, relevant_test_relations, seed=43)
irrelevant_val_relations = _downsample_negatives(
    irrelevant_val_relations, relevant_val_relations, seed=42)
irrelevant_train_relations = _downsample_negatives(
    irrelevant_train_relations, relevant_train_relations, seed=42)

In [None]:
# Seed for the per-split shuffles: without it every run of the notebook would
# write the rows in a different order.
SHUFFLE_SEED = 42

# Merge positives and negatives of each split, shuffle them and tag the split name
test_relations = pd.concat(
    [relevant_test_relations, irrelevant_test_relations]
).sample(frac=1, random_state=SHUFFLE_SEED)
test_relations.insert(0, 'split', 'test')

val_relations = pd.concat(
    [relevant_val_relations, irrelevant_val_relations]
).sample(frac=1, random_state=SHUFFLE_SEED)
val_relations.insert(0, 'split', 'validation')

train_relations = pd.concat(
    [relevant_train_relations, irrelevant_train_relations]
).sample(frac=1, random_state=SHUFFLE_SEED)
train_relations.insert(0, 'split', 'train')

full_relations_dataset = pd.concat([train_relations, test_relations, val_relations], ignore_index=True)
full_relations_dataset.groupby(['split', 'relation']).size()

In [None]:
# Label written to the output files for each relation value
RELATIONS_MAP = {
    'SUPPORTING ARGUMENT': '__label__Support',
    'OPPOSING ARGUMENT': '__label__Attack',
    'NO RELATION': '__label__noRel',
}

mapped_labels = full_relations_dataset['relation'].map(RELATIONS_MAP)
full_relations_dataset['mapped_relation'] = mapped_labels
full_relations_dataset.groupby(['split', 'mapped_relation']).size()

In [None]:
# Column / non-null summary of the assembled relations dataset
full_relations_dataset.info()

In [None]:
# `to_csv` does not create missing directories, so make sure `clean/` exists first
clean_dir = DATA_PATH / "clean"
clean_dir.mkdir(parents=True, exist_ok=True)
full_relations_dataset.to_csv(clean_dir / "bcause-gamma-full.csv", index=False)

In [None]:
# Headerless TSV (label, position text, argument text) for the train split.
# Written under DATA_PATH, next to the full CSV export, instead of a separately
# hard-coded '../data/...' path that does not match DATA_PATH.
train_rows = full_relations_dataset['split'] == 'train'
full_relations_dataset.loc[train_rows, ['mapped_relation', 'position_text', 'argument_text']].to_csv(
    DATA_PATH / "clean" / "train.tsv", sep="\t", index=False, header=False
)

In [None]:
# Headerless TSV (label, position text, argument text) for the test split.
# Written under DATA_PATH, next to the full CSV export, instead of a separately
# hard-coded '../data/...' path that does not match DATA_PATH.
test_rows = full_relations_dataset['split'] == 'test'
full_relations_dataset.loc[test_rows, ['mapped_relation', 'position_text', 'argument_text']].to_csv(
    DATA_PATH / "clean" / "test.tsv", sep="\t", index=False, header=False
)

In [None]:
# Headerless TSV (label, position text, argument text) for the validation split.
# Written under DATA_PATH, next to the full CSV export, instead of a separately
# hard-coded '../data/...' path that does not match DATA_PATH.
validation_rows = full_relations_dataset['split'] == 'validation'
full_relations_dataset.loc[validation_rows, ['mapped_relation', 'position_text', 'argument_text']].to_csv(
    DATA_PATH / "clean" / "validation.tsv", sep="\t", index=False, header=False
)