# Paraphrasing to generate more train prompts for rare subroles

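- Takes the annotation CSV and the raw text documents as input.
- For every annotation that has a rare subrole, generates 5 paraphrased variants of the surrounding text (the raw text that is used to build the prompt).
- Stores the resulting prompt/response pairs as a JSON Lines file (one JSON object per line) in the output directory.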

Code Reference:
- [1] https://github.com/mohan696matlab/Gen-AI-Mini-Projects/blob/main/Paraphraser_app/paraphraser_trial.ipynb

In [None]:
import os
import re
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from collections import Counter
from google.colab import drive
from tqdm import tqdm
import json
import torch
import nltk
from nltk.tokenize import sent_tokenize
# Download the NLTK "punkt_tab" sentence-tokenizer data (needed for sent_tokenize)
nltk.download('punkt_tab')

# Mount Google Drive so the project files under /content/drive are reachable (Colab only)
drive.mount('/content/drive')

# Project root on the mounted Google Drive; every other path is derived from it.
BASE_DIR = "/content/drive/MyDrive/Llama_3B_Instruct_with_Pre-constructed_Prompts"

# Resolve a project-relative path against BASE_DIR
def path_builder(relative_path):
    """Join ``relative_path`` onto ``BASE_DIR`` and return the result as a string.

    An absolute ``relative_path`` replaces ``BASE_DIR`` entirely, following
    ``pathlib`` joining rules.
    """
    from pathlib import Path
    full_path = Path(BASE_DIR, relative_path)
    return str(full_path)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Mounted at /content/drive


In [None]:

# Taxonomy definitions from the PDF: each main role maps to its fine-grained subroles.
# Key order is significant: the prompt text lists the roles by iterating this dict.
ROLES_TAXONOMY = {
    "Protagonist": [
        "Guardian",
        "Martyr",
        "Peacemaker",
        "Rebel",
        "Underdog",
        "Virtuous",
    ],
    "Antagonist": [
        "Instigator",
        "Conspirator",
        "Tyrant",
        "Foreign Adversary",
        "Traitor",
        "Spy",
        "Saboteur",
        "Corrupt",
        "Incompetent",
        "Terrorist",
        "Deceiver",
        "Bigot",
    ],
    "Innocent": [
        "Forgotten",
        "Exploited",
        "Victim",
        "Scapegoat",
    ],
}

# Load the paraphrasing tokenizer and model from one checkpoint name and place
# the model on the GPU when one is available, otherwise on the CPU.
PARAPHRASER_CHECKPOINT = "humarin/chatgpt_paraphraser_on_T5_base"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(PARAPHRASER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(PARAPHRASER_CHECKPOINT).to(device)



def paraphrase_sentence(
    sentence,
    num_beams=10,
    num_beam_groups=5,  # Use 1 if enabling sampling
    num_return_sequences=5,
    max_length=128,
    use_sampling=True  # Switch between group beam search and sampling
):
    """
    Paraphrase one sentence several times with the module-level T5 model.

    The sentence is prefixed with ``paraphrase: ``, tokenized (truncated to
    ``max_length`` tokens) and handed to ``model.generate`` with one of two
    decoding strategies: plain sampling or group beam search.

    Args:
        sentence (str): The sentence to paraphrase.
        num_beams (int): Beam count; only used when ``use_sampling`` is False.
        num_beam_groups (int): Beam-group count; only used when
            ``use_sampling`` is False.
        num_return_sequences (int): Number of sequences requested from the model.
        max_length (int): Token limit applied to both the input and the output.
        use_sampling (bool): True for sampling, False for group beam search.

    Returns:
        list: The decoded output sequences, with special tokens removed.
    """
    encoded = tokenizer(
        f'paraphrase: {sentence}',
        return_tensors="pt",
        padding="longest",
        max_length=max_length,
        truncation=True,
    )
    prompt_ids = encoded.input_ids.to(device)

    # Settings shared by both decoding strategies.
    generation_kwargs = {
        "num_return_sequences": num_return_sequences,
        "max_length": max_length,
        "no_repeat_ngram_size": 2,
        "repetition_penalty": 1.5,
    }

    if use_sampling:
        # Sampling for diversity: beam search off, temperature left at 1.0.
        generation_kwargs.update(
            num_beams=1,
            do_sample=True,
            temperature=1.0,
        )
    else:
        # Group beam search for diversity: the penalty pushes groups apart.
        generation_kwargs.update(
            num_beams=num_beams,
            num_beam_groups=num_beam_groups,
            do_sample=False,
            diversity_penalty=3.0,
        )

    generated = model.generate(prompt_ids, **generation_kwargs)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)



def extract_snippet_with_sentence_check(file_path, start_offset, end_offset, max_sentences=50):
    """
    Return the sentences surrounding an entity mention in a raw document.

    The file is read as UTF-8 and split into sentences. The sentences that
    contain ``start_offset`` and ``end_offset`` are located, and a window of
    neighbouring sentences around them is joined with single spaces.

    Args:
        file_path (str): Path of the raw text document.
        start_offset (int): Character offset of the entity start in the document.
        end_offset (int): Character offset of the entity end in the document.
        max_sentences (int): Size of the context window; ``max_sentences // 2``
            sentences are taken before the entity's first sentence, and the
            window ends ``max_sentences // 2`` sentences after the entity's
            last sentence (that sentence itself counts as the first of them).

    Returns:
        str | None: The stripped snippet. When no sentence contains the
        offsets, a raw character window of 100 characters on each side is
        returned instead. ``None`` is returned (after printing the error) when
        the file cannot be read or processed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Tokenize the entire text into sentences
        sentences = sent_tokenize(text)

        # Find the sentences containing the entity offsets. The search for
        # each sentence resumes where the previous one ended; searching from
        # position 0 every time would map a repeated sentence onto its first
        # occurrence and yield wrong character spans.
        entity_start_sentence = None
        entity_end_sentence = None
        cursor = 0
        last_offset = max(start_offset, end_offset)
        for idx, sentence in enumerate(sentences):
            sentence_start = text.find(sentence, cursor)
            if sentence_start == -1:
                # Sentence text not found verbatim; skip it rather than
                # compare against a bogus negative span.
                continue
            sentence_end = sentence_start + len(sentence)
            cursor = sentence_end

            if sentence_start <= start_offset <= sentence_end:
                entity_start_sentence = idx
            if sentence_start <= end_offset <= sentence_end:
                entity_end_sentence = idx

            # Spans are visited in document order, so nothing later can match.
            if sentence_start > last_offset:
                break

        # If sentences are not found, fall back to raw windowed text
        if entity_start_sentence is None or entity_end_sentence is None:
            snippet_start = max(0, start_offset - 100)
            snippet_end = min(len(text), end_offset + 100)
            return text[snippet_start:snippet_end].strip()

        # Expand context by including neighboring sentences. The slice end is
        # exclusive, so it is clamped to at least ``entity_end_sentence + 1``;
        # otherwise a window smaller than 2 would drop the entity's own sentence.
        half_window = max_sentences // 2
        snippet_start_idx = max(0, entity_start_sentence - half_window)
        snippet_end_idx = min(
            len(sentences),
            max(entity_end_sentence + 1, entity_end_sentence + half_window),
        )

        # Combine the selected sentences into a full snippet
        snippet = " ".join(sentences[snippet_start_idx:snippet_end_idx])

        return snippet.strip()

    except Exception as e:
        # Deliberately best-effort: callers skip rows whose snippet is None.
        print(f"Error processing file {file_path}: {e}")
        return None

def add_role_descriptions_and_instructions():
    """
    Build the two static text sections shared by every prompt.

    Returns:
        tuple: ``(role_descriptions, instructions)`` where
        ``role_descriptions`` lists each main role of ``ROLES_TAXONOMY`` with
        its subroles, and ``instructions`` describes the expected output format.
    """
    role_lines = [
        f"{role}: {', '.join(subroles)}." for role, subroles in ROLES_TAXONOMY.items()
    ]
    role_descriptions = "Available Roles and Subroles:\n\n    " + "\n    ".join(role_lines)

    # The third main role is "Innocent", matching ROLES_TAXONOMY and the
    # main_role values written into the responses. The text previously said
    # "Neutral", a role that does not exist in the taxonomy.
    instructions = (
        "Instructions:\n\n"
        "    - The entity can belong to only one of the three main roles: Protagonist, Antagonist, or Innocent.\n"
        "    - Each main role has its own unique set of subroles. Subroles are specific to the main role and cannot overlap with other main roles.\n"
        "    - The model should output:\n"
        "        - The main role on the first line.\n"
        "        - The subroles (one or more) on the second line.\n"
        "    - No additional text, explanation, or formatting should be provided."
    )

    return role_descriptions, instructions

def identify_rare_subroles(annotation_file_path, threshold=20):
    """
    Find the subroles that occur fewer than ``threshold`` times in the annotations.

    Only (main role, subrole) pairs that exist in ``ROLES_TAXONOMY`` are
    counted; subroles that never occur have a count of 0 and are therefore
    reported as rare as well.

    Args:
        annotation_file_path (str): CSV file with a ``main_role`` column and a
            ``fine_grained_roles`` column holding a Python list literal of
            subrole names.
        threshold (int): A subrole is rare when its count is below this value.

    Returns:
        list: Rare subrole names, in taxonomy order.
    """
    # Local import keeps this function self-contained.
    import ast

    annotations = pd.read_csv(annotation_file_path)
    role_subrole_counts = {
        role: dict.fromkeys(subroles, 0) for role, subroles in ROLES_TAXONOMY.items()
    }

    for main_role, raw_subroles in zip(annotations['main_role'], annotations['fine_grained_roles']):
        counts_for_role = role_subrole_counts.get(main_role)
        if counts_for_role is None:
            continue
        # ast.literal_eval replaces eval(): it parses the stored list literal
        # without executing arbitrary code that a CSV cell might contain.
        for subrole in ast.literal_eval(raw_subroles):
            if subrole in counts_for_role:
                counts_for_role[subrole] += 1

    rare_subroles = [
        subrole
        for subrole_counts in role_subrole_counts.values()
        for subrole, count in subrole_counts.items()
        if count < threshold
    ]
    return rare_subroles



def create_paraphrase(rare_subroles, annotation_file, raw_documents_folder, output_file=None, num_variants=5):
    """
    Generate multiple paraphrased prompts for rare subroles in the dataset.

    For every annotation row whose subroles include at least one rare subrole,
    the context snippet around the entity is extracted, each of its sentences
    is paraphrased ``num_variants`` times, and the i-th paraphrases of all
    sentences are joined into the i-th variant of the snippet. Each variant
    becomes one prompt, paired with the row's gold role and subroles.

    Args:
        rare_subroles (list): List of rare subroles to target.
        annotation_file (str): Path to the cleaned annotations CSV file.
        raw_documents_folder (str): Path to the folder containing raw text documents.
        output_file (str, optional): Path to save the paraphrased prompts. The
            file is written as JSON Lines (one record per line).
        num_variants (int): Number of paraphrased variants requested per snippet.

    Returns:
        list: A list of dictionaries containing paraphrased prompts and responses.
    """
    # Local import keeps this function self-contained.
    import ast

    rare_lookup = set(rare_subroles)

    # These two sections are identical for every prompt, so build them once.
    role_descriptions, instructions = add_role_descriptions_and_instructions()

    annotations = pd.read_csv(annotation_file)
    paraphrased_prompts = []

    for _, row in tqdm(annotations.iterrows(), total=annotations.shape[0], desc="Generating Paraphrases"):
        # ast.literal_eval replaces eval(): it parses the stored list literal
        # without executing arbitrary code that a CSV cell might contain.
        subroles = ast.literal_eval(row['fine_grained_roles'])
        if rare_lookup.isdisjoint(subroles):
            continue

        entity_mention = row['entity_mention']
        start_offset = int(row['start_offset'])
        end_offset = int(row['end_offset'])

        document_path = os.path.join(raw_documents_folder, row['article_id'])
        snippet = extract_snippet_with_sentence_check(document_path, start_offset, end_offset)
        if not snippet:
            # Unreadable document or empty context: nothing to paraphrase.
            continue

        # One list of paraphrases per sentence of the snippet.
        per_sentence_paraphrases = [
            paraphrase_sentence(sentence, num_return_sequences=num_variants)
            for sentence in sent_tokenize(snippet)
        ]

        # Transpose: variant i is the i-th paraphrase of every sentence, in order.
        paraphrased_texts = [" ".join(parts) for parts in zip(*per_sentence_paraphrases)]

        # The gold answer is the same for every variant of this row.
        main_role = row['main_role']
        fine_grained_roles = ", ".join(subroles)
        response = f"Role: {main_role}\nSubrole(s): {fine_grained_roles}"

        for paraphrased_text in paraphrased_texts:
            paraphrased_prompt = (
                f"Text:\n{paraphrased_text}\n\n"
                f"{role_descriptions}\n\n"
                f"{instructions}\n\n"
                f"Task:\nDefine the role and subroles of '{entity_mention}'.\n"
            )
            paraphrased_prompts.append({"prompt": paraphrased_prompt, "response": response})

    if output_file:
        pd.DataFrame(paraphrased_prompts).to_json(output_file, orient="records", lines=True)
        print(f"Paraphrased prompts saved to {output_file}")

    return paraphrased_prompts



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Inputs and output location, all resolved relative to BASE_DIR.
annotation_file = path_builder("Dataset_EN_PT/train_data/train.csv")
raw_documents_folder = path_builder("Dataset_EN_PT/raw-documents_EN_PT")
# NOTE: despite the .json extension the file is written as JSON Lines (one record per line).
output_file = path_builder("Dataset_EN_PT/train_data/paraphrased_prompts.json")

# Subroles annotated fewer than 20 times in the training data count as rare.
rare_subroles = identify_rare_subroles(annotation_file, threshold=20)

# Paraphrase the context of every rare-subrole annotation and save the prompts.
paraphrased_prompts = create_paraphrase(rare_subroles, annotation_file, raw_documents_folder, output_file)

Generating Paraphrases: 100%|██████████| 1385/1385 [22:22<00:00,  1.03it/s]

Paraphrased prompts saved to /content/drive/My Drive/NLP_Project/Dataset_EN_PT/train_data/paraphrased_prompts.json





### Just some checks.

In [None]:
def paraphrased_prompts_stats(json_file_path):
    """
    Summarise a JSON Lines file of prompt/response records.

    Args:
        json_file_path (str): Path to the file; each line is one record with
            a ``prompt`` and a ``response`` field.

    Returns:
        dict | None: Total number of entries, number of distinct prompts,
        number of distinct responses, and how often each subrole appears
        after the ``Subrole(s):`` marker of the responses. ``None`` is
        returned (after printing the error) when reading or processing fails.
    """
    marker = "Subrole(s):"
    try:
        frame = pd.read_json(json_file_path, orient="records", lines=True)

        summary = {
            "Total Entries": len(frame),
            "Unique Prompts": frame['prompt'].nunique(),
            "Unique Responses": frame['response'].nunique(),
        }

        # Tally the comma-separated subrole names that follow the marker.
        tally = Counter()
        for answer in frame['response']:
            if marker not in answer:
                continue
            _, _, listed = answer.rpartition(marker)
            tally.update(name.strip() for name in listed.strip().split(","))

        summary["Subrole Counts"] = dict(tally)
        return summary
    except Exception as e:
        print(f"Error reading or processing the JSON file: {e}")
        return None


In [None]:
# Same file that the paraphrasing step wrote.
json_file_path = path_builder("Dataset_EN_PT/train_data/paraphrased_prompts.json")

# paraphrased_prompts_stats returns None on failure, hence the guard below.
counts = paraphrased_prompts_stats(json_file_path)
if counts:
    print("Summary of JSON File:")
    print(f"Total Entries: {counts['Total Entries']}")
    print(f"Unique Prompts: {counts['Unique Prompts']}")
    print(f"Unique Responses: {counts['Unique Responses']}")
    print("\nSubrole Counts:")
    # One line per subrole found in the responses.
    for subrole, count in counts['Subrole Counts'].items():
        print(f"{subrole}: {count}")

Summary of JSON File:
Total Entries: 385
Unique Prompts: 385
Unique Responses: 14
