# 1. Load Native Data

This notebook, part of the ETL phase of the project, is devoted to loading the HateXplain dataset and preparing it for the next preprocessing steps.

In [1]:
# Necessary imports
import json
import numpy as np
import polars as pl
import sys

import seaborn as sns

from pathlib import Path

# Get the absolute path of the project's root directory.
# NOTE: this takes the parent of the kernel's current working directory, so the
# notebook must be run from a direct sub-directory of the project root.
ROOT_DIR = (Path.cwd() / "..").resolve()

# Add the root directory to sys.path so that project packages can be imported.
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
if str(ROOT_DIR) not in sys.path:
    sys.path.append(str(ROOT_DIR))

# Project-local import: it is placed after the sys.path update so that the
# `src` package located under ROOT_DIR can be resolved from this notebook.
from src.utils.set_seed import set_seed

# Set the seed for reproducibility.
# NOTE(review): the default seed value and the type of the returned object are
# defined in src/utils/set_seed.py and are not visible here — presumably `rng`
# is a random number generator; confirm against that module.
rng = set_seed()


In [None]:
# Directory layout: every path is derived from the project root.
DATA_DIR = ROOT_DIR.joinpath("data")
# Folder the native `dataset.json` file is read from.
NATIVE_DATA_DIR = DATA_DIR.joinpath("native")
# Folder this notebook writes its Parquet output to.
PREPROCESSED_DATA_DIR = DATA_DIR.joinpath("preprocessed")



In [4]:
# Read the native HateXplain dataset (a single JSON document) into memory.
native_dataset_path = NATIVE_DATA_DIR / "dataset.json"
with native_dataset_path.open(mode="r", encoding="utf-8") as file:
    dataset = json.load(file)


**The dataset contains the following fields:**
- `post_id`: the unique identifier (UID) or primary key of the data instance.
- `annotators`: The list of annotations from each annotator
    - `annotators[label]` : The label assigned by the annotator to this post. Possible values: `[Hatespeech, Offensive, Normal]`.
    - `annotators[annotator_id]` : The UID assigned to each annotator
    - `annotators[target]` : A list of the target communities present in the post
- `rationales`: A list of rationales selected by annotators. Each rationale represents a list with values 0 or 1. A value of 1 indicates that the token is part of the rationale selected by the annotator. The corresponding token can be retrieved using the same index position in `post_tokens`.
- `post_tokens` : The list of tokens representing the post which was annotated.

We will create a table with the following columns:
- `post_id`: the UID of the data instance.
- `labels`: a list of the labels assigned by the independent annotators to each data instance.
- `targets`: a list of the target communities assigned by each independent annotator to each data instance.
- `tokens`: the list of tokens corresponding to each data instance.
- `rationales`: a list of the rationales selected by the annotators for each data instance.


In [5]:
# Extract the relevant columns from the JSON dataset.
# `dataset` maps each post_id to its record; keys and values are materialised
# as lists so that every column is a concrete, equally ordered sequence rather
# than a live view onto the dictionary.
post_ids = list(dataset.keys())
posts = list(dataset.values())

# One entry per post: the list of per-annotator annotation dicts.
annotations = [post["annotators"] for post in posts]

# For each post, the label given by each annotator.
labels = [
    [annotator["label"] for annotator in annotation] for annotation in annotations
]
# For each post, the list of target communities given by each annotator.
targets = [
    [annotator["target"] for annotator in annotation] for annotation in annotations
]

# Tokens of each post and the per-annotator rationale masks over those tokens.
tokens = [post["post_tokens"] for post in posts]
rationales = [post["rationales"] for post in posts]



In [6]:
# Construct a Polars DataFrame with the data.
# The explicit schema pins the nested list dtypes instead of relying on
# inference; rationales (0/1 token masks) are stored as Float32.
df_dataset = pl.DataFrame(
    {
        "post_id": post_ids,
        "labels": labels,
        "targets": targets,
        "tokens": tokens,
        "rationales": rationales,
    },
    schema={
        "post_id": pl.String,
        "labels": pl.List(pl.String),
        "targets": pl.List(pl.List(pl.String)),
        "tokens": pl.List(pl.String),
        "rationales": pl.List(pl.List(pl.Float32)),
    },
)

# Make sure the output directory exists: on a fresh checkout it may be missing,
# in which case writing the Parquet file would fail.
PREPROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to disk in Parquet format
df_dataset.write_parquet(PREPROCESSED_DATA_DIR / "dataset_native.parquet")
