# Dataset Preparation and Prompt Engineering

Note: The CSV file for each partition (train.csv, val.csv, test.csv) must contain the columns listed below.

- train.csv/val.csv = **[ *article_id, entity_mention, start_offset, end_offset, main_role, fine_grained_roles* ]**
- test.csv = **[ *article_id, entity_mention, start_offset, end_offset* ]**

In [1]:
import os
import re
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from transformers import pipeline
from collections import Counter
from google.colab import drive
from tqdm import tqdm
import json
import nltk
from nltk.tokenize import sent_tokenize
# Download NLTK's punkt tokenizer
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
# Mount drive to access files
drive.mount('/content/drive')

# Root folder on the mounted drive; every project path is resolved against it
BASE_DIR = "/content/drive/MyDrive/Llama_3B_Instruct_with_Pre-constructed_Prompts"


def path_builder(relative_path):
    """Resolve `relative_path` against BASE_DIR and return the result as a string."""
    from pathlib import Path

    full_path = Path(BASE_DIR).joinpath(relative_path)
    return str(full_path)

Mounted at /content/drive


In [3]:
!ls "{path_builder('Dataset_EN_PT')}"

combined-cleaned.csv	   gt-prompts.json	test_data   val_data
generated_predictions.csv  raw-documents_EN_PT	train_data


### **Generate Prompts:**
- Full sentence extractions for snippet.
- Adjustable max_sentences surrounding the entity_mention.
- Takes the annotation CSV files and creates prompts.
- Only for the test.csv annotation file, responses aren't generated.

In [4]:
# Taxonomy definitions from the PDF.
# Maps each main role to the list of fine-grained subroles that belong to it.
# The prompt text and the distribution statistics below are both built from
# this mapping, so the keys are the only main roles the notebook recognises.
ROLES_TAXONOMY = {
    "Protagonist": ["Guardian", "Martyr", "Peacemaker", "Rebel", "Underdog", "Virtuous"],
    "Antagonist": [
        "Instigator", "Conspirator", "Tyrant", "Foreign Adversary", "Traitor",
        "Spy", "Saboteur", "Corrupt", "Incompetent", "Terrorist", "Deceiver", "Bigot"
    ],
    "Innocent": ["Forgotten", "Exploited", "Victim", "Scapegoat"]
}


def check_and_create_directory(output_file):
    """
    Make sure the directory that will hold the output file exists.

    Args:
        output_file (str): Path to the output file. Only its directory part
            is inspected; the file itself is never created here.

    Returns:
        None
    """
    output_dir = os.path.dirname(output_file)  # Directory part of the output file path

    if not output_dir:
        # A bare file name has no directory component: the file will land in
        # the current working directory, and os.makedirs("") would raise.
        return

    if os.path.exists(output_dir):
        print(f"Directory {output_dir} already exists.")
        return

    print(f"Directory {output_dir} does not exist. Creating it...")
    # exist_ok avoids a failure if the directory appears between the check
    # above and this call.
    os.makedirs(output_dir, exist_ok=True)



def extract_snippet_with_sentence_check(file_path, start_offset, end_offset, max_sentences=50):
    """
    Extract a text snippet made of whole sentences around an entity mention.

    The document is split into sentences, the sentence(s) containing the
    entity's character offsets are located, and a window of neighbouring
    sentences is joined with single spaces.

    Args:
        file_path (str): Path to the text file.
        start_offset (int): Start index of the entity mention.
        end_offset (int): End index of the entity mention.
        max_sentences (int): Size of the sentence window. When the mention
            sits in a single sentence the snippet holds at most this many
            sentences (fewer near the document edges); values below 1 are
            treated as 1 so the entity sentence is always kept.

    Returns:
        str or None: The snippet; a raw +/-100 character window when no
        sentence contains the offsets; None when the file could not be
        read or processed (the error is printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Tokenize the entire text into sentences
        sentences = sent_tokenize(text)

        # Locate the sentences containing the entity offsets.
        # Assumes sent_tokenize yields verbatim substrings of `text` in
        # document order, so each search can resume where the previous
        # sentence ended. Searching from 0 every time would map a repeated
        # sentence onto its first occurrence.
        entity_start_sentence = None
        entity_end_sentence = None
        last_relevant_offset = max(start_offset, end_offset)
        search_from = 0
        for idx, sentence in enumerate(sentences):
            sentence_start = text.find(sentence, search_from)
            if sentence_start == -1:
                # Sentence not found verbatim; skip it without moving the cursor
                continue
            if sentence_start > last_relevant_offset:
                # Every remaining sentence starts after the entity
                break
            sentence_end = sentence_start + len(sentence)
            search_from = sentence_end
            if sentence_start <= start_offset <= sentence_end:
                entity_start_sentence = idx
            if sentence_start <= end_offset <= sentence_end:
                entity_end_sentence = idx

        # If sentences are not found, fall back to raw windowed text
        if entity_start_sentence is None or entity_end_sentence is None:
            print(f"\nCould not find sentences containing the entity offsets in {file_path}. Falling back to raw windowed text.")
            snippet_start = max(0, start_offset - 100)
            snippet_end = min(len(text), end_offset + 100)
            return text[snippet_start:snippet_end].strip()

        # Split the window around the entity. The slice end is exclusive, so
        # the part after the entity uses the rounded-up half; for an even
        # max_sentences this equals max_sentences // 2, and for odd values
        # (including 1) the entity sentence is no longer dropped.
        window = max(1, max_sentences)
        sentences_before = window // 2
        slice_end_shift = window - sentences_before
        snippet_start_idx = max(0, entity_start_sentence - sentences_before)
        snippet_end_idx = min(len(sentences), entity_end_sentence + slice_end_shift)

        # Combine the selected sentences into a full snippet
        snippet = " ".join(sentences[snippet_start_idx:snippet_end_idx])

        return snippet.strip()

    except Exception as e:
        # Deliberately best-effort: callers skip rows whose snippet is None
        print(f"Error processing file {file_path}: {e}")
        return None


def add_role_descriptions_and_instructions(taxonomy=None):
    """
    Build the taxonomy listing and the answering instructions for a prompt.

    Args:
        taxonomy (dict, optional): Mapping of main role -> list of subroles.
            Defaults to the module-level ROLES_TAXONOMY.

    Returns:
        tuple[str, str]: (role_descriptions, instructions). The main roles
        named in the instructions are taken from the taxonomy keys so the two
        texts cannot disagree.
    """
    if taxonomy is None:
        taxonomy = ROLES_TAXONOMY

    role_lines = "\n    ".join(
        f"{role}: {', '.join(subroles)}." for role, subroles in taxonomy.items()
    )
    role_descriptions = f"Available Roles and Subroles:\n\n    {role_lines}"

    # Render the main roles as "A, B, or C" (or "A or B" / "A")
    main_roles = list(taxonomy)
    if len(main_roles) > 2:
        roles_text = f"{', '.join(main_roles[:-1])}, or {main_roles[-1]}"
    else:
        roles_text = " or ".join(main_roles)
    number_words = {1: "one", 2: "two", 3: "three", 4: "four", 5: "five"}
    role_count = number_words.get(len(main_roles), str(len(main_roles)))

    instructions = (
        "Instructions:\n\n"
        f"    - The entity can belong to only one of the {role_count} main roles: {roles_text}.\n"
        "    - Each main role has its own unique set of subroles. Subroles are specific to the main role and cannot overlap with other main roles.\n"
        "    - The model should output:\n"
        "        - The main role on the first line.\n"
        "        - The subroles (one or more) on the second line.\n"
        "    - No additional text, explanation, or formatting should be provided."
    )

    return role_descriptions, instructions



def create_prompts(annotation_file, raw_documents_folder, output_file, is_training=True):
    """
    Create prompts from annotations and raw documents with a custom structure.

    For every annotation row a sentence-level snippet around the entity is
    extracted from the article, combined with the role taxonomy and the
    answering instructions, and stored as one record. Rows whose snippet could
    not be extracted are skipped.

    Args:
        annotation_file (str): Path to the cleaned annotations CSV file.
        raw_documents_folder (str): Path to the folder containing raw text documents.
        output_file (str): Path of the output file; records are written as
            JSON Lines (one JSON object per line).
        is_training (bool): When True, each record also carries a "response"
            built from the `main_role` and `fine_grained_roles` columns. Pass
            False for files without gold labels.

    Returns:
        None
    """
    # Function-scope import keeps this block self-contained
    import ast

    # Check if the output directory exists, if not create it
    check_and_create_directory(output_file)

    # Load the cleaned annotations
    annotations = pd.read_csv(annotation_file)

    # The taxonomy and instructions are identical for every row: build once
    role_descriptions, instructions = add_role_descriptions_and_instructions()

    prompts = []
    skipped = 0

    # Process each annotation with a progress bar
    for _, row in tqdm(annotations.iterrows(), total=annotations.shape[0], desc="Generating Prompts"):
        article_id = row['article_id']
        entity_mention = row['entity_mention']
        start_offset = int(row['start_offset'])
        end_offset = int(row['end_offset'])

        # Construct the file path for the raw document
        document_path = os.path.join(raw_documents_folder, article_id)

        # Extract the snippet; None/empty means the document could not be used
        snippet = extract_snippet_with_sentence_check(document_path, start_offset, end_offset)
        if not snippet:
            skipped += 1
            continue

        prompt = (
            f"Text:\n{snippet}\n\n"
            f"{role_descriptions}\n\n"
            f"{instructions}\n\n"
            f"Task:\nDefine the role and subroles of '{entity_mention}'.\n"
        )

        if is_training:
            # The column stores a Python list literal; literal_eval parses it
            # without executing arbitrary code the way eval would.
            main_role = row['main_role']
            fine_grained_roles = ", ".join(ast.literal_eval(row['fine_grained_roles']))
            response = f"Role: {main_role}\nSubrole(s): {fine_grained_roles}"
            prompts.append({"prompt": prompt, "response": response})
        else:
            # No response for unlabeled data
            prompts.append({"prompt": prompt})

    if skipped:
        # Make dropped rows visible so prompt counts can be reconciled with the CSV
        print(f"Skipped {skipped} annotation(s) with no extractable snippet.")

    # Save the prompts as JSON Lines
    pd.DataFrame(prompts).to_json(output_file, orient="records", lines=True)
    print(f"Prompts saved to {output_file}")



In [5]:
# All partitions read their articles from the same raw-documents folder
raw_documents_folder = path_builder("Dataset_EN_PT/raw-documents_EN_PT")

# (annotation CSV, output prompts file, include gold responses?)
prompt_jobs = (
    ("Dataset_EN_PT/train_data/train.csv", "Dataset_EN_PT/train_data/train-prompts.json", True),
    ("Dataset_EN_PT/val_data/val.csv", "Dataset_EN_PT/val_data/val-prompts.json", True),
    ("Dataset_EN_PT/test_data/test.csv", "Dataset_EN_PT/test_data/test-prompts.json", False),
)

# Generate training, validation and test prompts, in that order
for annotation_rel, output_rel, with_responses in prompt_jobs:
    annotation_file = path_builder(annotation_rel)
    output_file = path_builder(output_rel)
    create_prompts(annotation_file, raw_documents_folder, output_file, is_training=with_responses)

Directory /content/drive/MyDrive/Llama_3B_Instruct_with_Pre-constructed_Prompts/Dataset_EN_PT/train_data already exists.


Generating Prompts: 100%|██████████| 1385/1385 [03:20<00:00,  6.91it/s]


Prompts saved to /content/drive/MyDrive/Llama_3B_Instruct_with_Pre-constructed_Prompts/Dataset_EN_PT/train_data/train-prompts.json
Directory /content/drive/MyDrive/Llama_3B_Instruct_with_Pre-constructed_Prompts/Dataset_EN_PT/val_data already exists.


Generating Prompts:   4%|▍         | 12/280 [00:02<01:04,  4.17it/s]


Could not find sentences containing the entity offsets in /content/drive/MyDrive/Llama_3B_Instruct_with_Pre-constructed_Prompts/Dataset_EN_PT/raw-documents_EN_PT/EN_CC_200152.txt. Falling back to raw windowed text.


Generating Prompts: 100%|██████████| 280/280 [00:41<00:00,  6.70it/s]


Prompts saved to /content/drive/MyDrive/Llama_3B_Instruct_with_Pre-constructed_Prompts/Dataset_EN_PT/val_data/val-prompts.json
Directory /content/drive/MyDrive/Llama_3B_Instruct_with_Pre-constructed_Prompts/Dataset_EN_PT/test_data already exists.


Generating Prompts: 100%|██████████| 272/272 [00:41<00:00,  6.52it/s]


Prompts saved to /content/drive/MyDrive/Llama_3B_Instruct_with_Pre-constructed_Prompts/Dataset_EN_PT/test_data/test-prompts.json


### **Some inspections regarding the prompts**

In [6]:
def _print_prompt_samples(frame, sample_count, include_response=True):
    """Print up to `sample_count` randomly chosen prompts from `frame`.

    Args:
        frame (pd.DataFrame): Prompt records with a 'prompt' column and, when
            `include_response` is True, a 'response' column.
        sample_count (int): Number of prompts to show.
        include_response (bool): Also print the 'response' of each record.
    """
    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the request at the size of the frame.
    sampled = frame.sample(min(sample_count, len(frame)))
    for _, row in sampled.iterrows():
        if include_response:
            print(f"{row['prompt']}\n{row['response']}\n{'-' * 50}")
        else:
            print(f"{row['prompt']}\n{'-' * 50}")


# Training prompts (prompt + response)
json_file_path = path_builder("Dataset_EN_PT/train_data/train-prompts.json")
training_data = pd.read_json(json_file_path, lines=True)
print("Some sample training prompts: \n" + "_" * 50 + "\n\n")
_print_prompt_samples(training_data, 3)


# Validation prompts (prompt + response)
json_file_path = path_builder("Dataset_EN_PT/val_data/val-prompts.json")
validation_data = pd.read_json(json_file_path, lines=True)
print("Some sample validation prompts: \n" + "_" * 50 + "\n\n")
_print_prompt_samples(validation_data, 3)


# Test prompts (prompt only: the test file is generated without responses)
json_file_path = path_builder("Dataset_EN_PT/test_data/test-prompts.json")
testing_data = pd.read_json(json_file_path, lines=True)
print("\n\nSome sample test prompts: \n" + "_" * 50 + "\n\n")
_print_prompt_samples(testing_data, 2, include_response=False)


Some sample training prompts: 
__________________________________________________


Text:
If you thought the Great Replacement was nothing more than a conspiracy theory, you might want to think again. You can unsubscribe any time. By subscribing you agree to our Terms of Use


According to a WEF report, white people are responsible for the vast majority of the world’s problems including climate change and colonialism, and in the interests of an equitable future for the globe, the number of white people must be reduced as a priority. The WEF has enlisted the United Nations to help with the plan and the results are already playing out before our eyes. Before we dive in, subscribe to the channel if you haven’t already, and join the People’s Voice Locals community to join our incredible community and support the channel. The Netherlands has become one of the global elite’s testing grounds for the most extreme policies. Just as the Netherlands was ground zero for the elite’s war on farmers,

In [7]:
# For each partition, report the number of generated prompts and the number
# of distinct articles in its annotation file.
# (prompt-count label, article-count label, loaded prompts, annotation CSV)
partition_reports = (
    ("Training", "training", training_data, "Dataset_EN_PT/train_data/train.csv"),
    ("Validation", "validation", validation_data, "Dataset_EN_PT/val_data/val.csv"),
    # The test partition was previously reported under the "validation" label
    ("Testing", "testing", testing_data, "Dataset_EN_PT/test_data/test.csv"),
)

for prompts_label, articles_label, prompts_frame, annotation_rel in partition_reports:
    # Check the number of prompts
    num_prompts = prompts_frame.shape[0]
    print(f"Number of {prompts_label} Prompts: {num_prompts}")

    # Check the number of unique `article_id`s
    annotation_file_path = path_builder(annotation_rel)
    annotations = pd.read_csv(annotation_file_path)
    unique_article_ids = annotations["article_id"].nunique()
    print(f"Number of Unique Article IDs for {articles_label}: {unique_article_ids}")


Number of Training Prompts: 1385
Number of Unique Article IDs for training: 355
Number of Validation Prompts: 280
Number of Unique Article IDs for validation: 76
Number of Testing Prompts: 272
Number of Unique Article IDs for validation: 77


In [8]:

def check_train_dataset_distribution(annotation_file_path, taxonomy=None):
    """
    Compute and print the distribution of main roles and subroles from an annotation file.

    Rows whose main role is not in the taxonomy are ignored, as are subroles
    that are not listed under the row's main role.

    Args:
        annotation_file_path (str): Path to the cleaned annotations CSV file.
            It must provide the `main_role` and `fine_grained_roles` columns;
            the latter holds a Python list literal such as "['Guardian']".
        taxonomy (dict, optional): Mapping of main role -> list of subroles.
            Defaults to the module-level ROLES_TAXONOMY.

    Returns:
        None
    """
    # Function-scope import keeps this block self-contained
    import ast

    if taxonomy is None:
        taxonomy = ROLES_TAXONOMY

    # Load the annotations from the CSV file
    annotations = pd.read_csv(annotation_file_path)

    # One counter per main role plus one per subroles of that role
    role_counts = {
        role: {"count": 0, "subroles": dict.fromkeys(subroles, 0)}
        for role, subroles in taxonomy.items()
    }

    for main_role, raw_subroles in zip(annotations['main_role'], annotations['fine_grained_roles']):
        bucket = role_counts.get(main_role)
        if bucket is None:
            continue

        bucket["count"] += 1

        # literal_eval parses the stored list without executing code (unlike eval)
        for subrole in ast.literal_eval(raw_subroles):
            if subrole in bucket["subroles"]:
                bucket["subroles"][subrole] += 1

    # Print distribution
    for role, data in role_counts.items():
        print(f"Role: {role} - Total Samples: {data['count']}")
        print("Subrole Distribution:")
        for subrole, count in data["subroles"].items():
            print(f"  {subrole}: {count}")
        print("-" * 50)




def identify_rare_subroles(annotation_file_path, threshold=20, taxonomy=None):
    """
    Identify the rare subroles (frequency below the threshold) across all main roles.

    A subrole is counted only when it appears on a row whose main role is the
    taxonomy role it is listed under. Subroles that never occur count as 0 and
    are therefore rare for any positive threshold.

    Args:
        annotation_file_path (str): Path to the cleaned annotations CSV file.
            It must provide the `main_role` and `fine_grained_roles` columns;
            the latter holds a Python list literal such as "['Guardian']".
        threshold (int): Frequency threshold below which subroles are considered rare.
        taxonomy (dict, optional): Mapping of main role -> list of subroles.
            Defaults to the module-level ROLES_TAXONOMY.

    Returns:
        list: Rare subroles, flattened across roles in taxonomy order.
    """
    # Function-scope import keeps this block self-contained
    import ast

    if taxonomy is None:
        taxonomy = ROLES_TAXONOMY

    # Load the annotations from the CSV file
    annotations = pd.read_csv(annotation_file_path)

    # One zero-initialised counter per (main role, subrole) pair
    role_subrole_counts = {
        role: dict.fromkeys(subroles, 0) for role, subroles in taxonomy.items()
    }

    for main_role, raw_subroles in zip(annotations['main_role'], annotations['fine_grained_roles']):
        counts = role_subrole_counts.get(main_role)
        if counts is None:
            continue

        # literal_eval parses the stored list without executing code (unlike eval)
        for subrole in ast.literal_eval(raw_subroles):
            if subrole in counts:
                counts[subrole] += 1

    # Flatten to the subroles whose count is below the threshold
    rare_subroles = [
        subrole
        for counts in role_subrole_counts.values()
        for subrole, count in counts.items()
        if count < threshold
    ]

    return rare_subroles


In [9]:
# Annotation file path
annotation_file_path = path_builder("Dataset_EN_PT/train_data/train.csv")

# Print the main-role / subrole distribution of the training annotations
check_train_dataset_distribution(annotation_file_path)

# Define rare subroles; the threshold is named once so the call and the
# printed message cannot drift apart
rare_threshold = 20
rare_subroles = identify_rare_subroles(annotation_file_path, threshold=rare_threshold)
print(f"Rare Subroles under {rare_threshold} samples: {rare_subroles}")

Role: Protagonist - Total Samples: 333
Subrole Distribution:
  Guardian: 162
  Martyr: 13
  Peacemaker: 62
  Rebel: 32
  Underdog: 13
  Virtuous: 66
--------------------------------------------------
Role: Antagonist - Total Samples: 785
Subrole Distribution:
  Instigator: 107
  Conspirator: 112
  Tyrant: 55
  Foreign Adversary: 172
  Traitor: 16
  Spy: 2
  Saboteur: 37
  Corrupt: 75
  Incompetent: 82
  Terrorist: 83
  Deceiver: 78
  Bigot: 43
--------------------------------------------------
Role: Innocent - Total Samples: 267
Subrole Distribution:
  Forgotten: 9
  Exploited: 11
  Victim: 237
  Scapegoat: 13
--------------------------------------------------
Rare Subroles under 20 samples: ['Martyr', 'Underdog', 'Traitor', 'Spy', 'Forgotten', 'Exploited', 'Scapegoat']
