# 6. Generate Trees

This notebook, part of the analysis phase of the project, is devoted to generating the constituency trees that will later be fed to Integrated Directional Gradients (IDG).

In [None]:
# Necessary imports
import polars as pl
import sys

from pathlib import Path

from tqdm.auto import tqdm as tqdma

import stanza

# Project root: the parent of the current working directory, as an absolute path
ROOT_DIR = (Path.cwd() / "../").resolve()

# Put the project root on sys.path so the project-local `src` package below can be imported
sys.path.append(str(ROOT_DIR))

from src.utils.set_seed import set_seed

# Fix the seed for reproducibility and keep whatever `set_seed` returns as `rng`
rng = set_seed()


2025-04-07 09:33:22.599469: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-07 09:33:22.663571: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-07 09:33:22.676689: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-07 09:33:22.682106: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-07 09:33:22.752852: I tensorflow/core/platform/cpu_feature_guar

In [None]:
# Directory layout, all relative to the project root

# Input data
DATA_DIR = ROOT_DIR.joinpath("data")
SPLITTED_DATA_DIR = DATA_DIR.joinpath("splitted")

# Models
MODELS_DIR = ROOT_DIR.joinpath("models")

# Outputs: constituency trees written by this notebook, plus the IDG output folder
OUTPUT_DIR = ROOT_DIR.joinpath("output")
TREES_DIR = OUTPUT_DIR.joinpath("constituency_trees")
IDG_DIR = OUTPUT_DIR.joinpath("integrated_directional_gradients")


### 1. Load and process the HateXplain Dataset

In [4]:
# Read the two-class test split from the splitted-data directory
df_test = pl.read_parquet(SPLITTED_DATA_DIR.joinpath("test_2_classes.parquet"))


In [5]:
# Label encoding
label2id = {"normal": 0, "hatespeech": 1}
id2label = {id_: label for label, id_ in label2id.items()}
target_labels = list(label2id.keys())

df_test = df_test.with_columns(
    pl.col("label").replace_strict(label2id),
)


### 2. Generate the constituency trees for the sentences

In [None]:
def generate_tree(
    nlp_pipeline: stanza.Pipeline,
    sentence: str,
) -> str:
    """Parse a sentence and return its constituency tree, without the root node, as a string.

    Args:
        nlp_pipeline (stanza.Pipeline): The Stanza NLP pipeline called on the sentence. Its result
            must expose `sentences[i].constituency`.
        sentence (str): The input text to parse. Only the first sentence of the pipeline's
            result is used; any further sentences are ignored.

    Returns:
        str: The string form of the first child of the parse tree's root node.

    Raises:
        ValueError: If `sentence` is empty or whitespace-only, or if the pipeline returns a
            document with no sentences.

    """
    # Reject blank input up front instead of failing later with a bare IndexError
    if not sentence.strip():
        raise ValueError("Cannot generate a constituency tree for an empty sentence.")

    # Process the sentence
    doc = nlp_pipeline(sentence)

    # Guard the `sentences[0]` access below
    if not doc.sentences:
        raise ValueError(f"The pipeline returned no sentences for input: {sentence!r}")

    # Extract the constituency tree of the first sentence
    constituency_tree = doc.sentences[0].constituency

    # Drop the root node by keeping only its first child
    return str(constituency_tree.children[0])


In [7]:
# Build the English `stanza` pipeline used to produce the constituency trees
nlp = stanza.Pipeline(
    # Language and model package
    lang="en",
    package="default",
    # Processors to run
    processors="tokenize,pos,constituency",
    # The input is passed as already-tokenised text
    tokenize_pretokenized=True,
    # Run on the GPU
    use_gpu=True,
    # No download method: presumably the models are already on disk — TODO confirm
    download_method=None,
)



2025-04-07 09:33:24 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| constituency | ptb3-revised_charlm |

2025-04-07 09:33:24 INFO: Using device: cuda
2025-04-07 09:33:24 INFO: Loading: tokenize
2025-04-07 09:33:24 INFO: Loading: pos
2025-04-07 09:33:26 INFO: Loading: constituency
2025-04-07 09:33:26 INFO: Done loading processors!


In [9]:
# Parse every row: join its tokens with single spaces and generate the tree,
# with a tqdm progress bar over the rows
trees = [
    generate_tree(nlp, " ".join(row_tokens))
    for row_tokens in tqdma(df_test.get_column("tokens"), desc="Generating trees")
]

# Attach the trees to the DataFrame as a new "tree" column
df_test = df_test.with_columns(pl.Series(name="tree", values=trees))


Generating trees:   0%|          | 0/1376 [00:00<?, ?it/s]

In [10]:
# Create the output directory first so the write does not fail on a fresh checkout
# where it does not exist yet
TREES_DIR.mkdir(parents=True, exist_ok=True)

# Save the DataFrame with the new column
df_test.write_parquet(TREES_DIR / "test_2_classes_with_trees.parquet")
