# Install Libraries

In [1]:
!pip install -q -U transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m
[?25h

# Load Model

In [2]:
! pip install flash-attn --no-build-isolation

Collecting flash-attn
  Downloading flash_attn-2.7.4.post1.tar.gz (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.7.4.post1-cp310-cp310-linux_x86_64.whl size=187797312 sha256=b267f80a08e516292cdd748056a2178a45b8abedf7fca123292eb17c21c8c87c
  Stored in directory: /root/.cache/pip/wheels/59/ce/d5/08ea07bfc16ba218dc65a3a7ef9b6a270530bcbd2cea2ee1ca
Successfully built flash-attn
Installing collected packages: flash-attn
Successfully installed flash-attn-2.7.4.post1


In [3]:
import torch
import random
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

model_path="/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

tokenizer=AutoTokenizer.from_pretrained(model_path)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype = torch.bfloat16,
    device_map = "auto",
    trust_remote_code = True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
%%time

def generate_essay(prompt, max_tokens):
    messages = [{
        "role":"user",
        "content": prompt
    }]
    if max_tokens == 0:
        max_tokens = 100
    model_inputs = tokenizer.apply_chat_template(messages, return_tensors = "pt").to('cuda')
    
    # Setting `pad_token_id` to `eos_token_id` for open-ended generation.
    with torch.no_grad():
        generated_ids = model.generate(
            model_inputs,
            max_new_tokens = max_tokens,
            do_sample = True,
            pad_token_id = tokenizer.eos_token_id
        )

    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    text = decoded[0].split("[/INST]")[1]
    return text

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.87 µs


# Read Prompts

In [5]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path

with open("/kaggle/input/workshop-task-acl/SOMD2025-PhaseI/train_texts.txt", "r", encoding="utf-8") as file:
    text = file.read()

text_list = text.split('\n')

with open("/kaggle/input/workshop-task-acl/SOMD2025-PhaseI/train_entities.txt", "r", encoding="utf-8") as file:
    text = file.read()

labels_list = text.split('\n')

In [6]:
df = pd.DataFrame()
df['train_text'] = text_list
df['train_labels'] = labels_list

In [7]:
df['train_text_list'] = df['train_text'].str.split(' ')
df['train_labels_list'] = df['train_labels'].str.split(' ')

In [8]:
df['unique_labels'] = df['train_labels_list'].apply(lambda x: ' '.join(list(set(x))))

In [9]:
df['document'] = np.arange(len(df))

In [10]:
all_labels = [
    'B-Extension','I-Extension','B-Application','I-Application','B-Abbreviation','B-Citation','I-Citation',
    'B-SoftwareCoreference','I-SoftwareCoreference','B-URL','I-URL','B-AlternativeName', 'I-AlternativeName',
    'B-OperatingSystem','I-OperatingSystem','B-Developer','I-Developer','O','B-License','I-License','B-PlugIn','I-PlugIn',
    'B-Release','I-Release','B-ProgrammingEnvironment','I-ProgrammingEnvironment','B-Version','I-Version']

id2label = {i: l for i, l in enumerate(all_labels)}
label2id = {v: k for k, v in id2label.items()}
target = [l for l in all_labels if l != "O"]

In [11]:
from collections import defaultdict

def bundle_entities(tokens, labels):
    entity_dict = defaultdict(list)
    current_entity_tokens = []
    current_label = None

    for token, label in zip(tokens, labels):
        if label == 'O':  # Reset when outside an entity
            if current_entity_tokens and current_label:
                entity_dict[current_label].append(" ".join(current_entity_tokens))
            current_entity_tokens = []
            current_label = None
            continue

        # If we encounter a new entity (either 'B-' or 'I-' type)
        entity = label

        # If we are starting a new entity, process the previous one first
        if current_entity_tokens and current_label != entity:
            entity_dict[current_label].append(" ".join(current_entity_tokens))
            current_entity_tokens = []  # Reset the tokens for the new entity

        # Add the current token to the current entity
        current_entity_tokens.append(token)
        current_label = entity

    # Add last entity if it exists
    if current_entity_tokens and current_label:
        entity_dict[current_label].append(" ".join(current_entity_tokens))

    return dict(entity_dict)

In [12]:
l = []
for i in range(0, len(df)):
    l.append(bundle_entities(df.values[i][2], df.values[i][3]))

In [13]:
# Merging all entity values
merged_entities = defaultdict(set)

for entity_dict in l:
    for entity, values in entity_dict.items():
        merged_entities[entity].update(values)  # Using a set to remove duplicates

# Convert back to lists
merged_entities = {key: list(values) for key, values in merged_entities.items()}

# Generate Text

In [14]:
llm_essays = []
essay_counter = 0

random.seed(42)

# Generate a random sample of 10 integers between 0 and 1100
# sample = random.sample(range(len(df['train_text'])), len(df['train_text'])//2)
sample = random.sample(range(len(df['train_text'])), len(df['train_text']))
print(len(sample), len(df))

for p in sample:
    try:
        combined = [item for sublist in l[p].values() for item in sublist]
        prompt_combined = f'''
                Rephrase and augment this text on syntactic and semantic level
                text to be augmented  - {df['train_text'].values[p]}.
                ALSO KEEP THESE TOKENS in the final text - {combined}.
                '''
        # generate the essay
        essay_output = generate_essay(prompt_combined, len(df['train_text'].values[p])//1.5)
        essay_counter += 1
    
        labels = {value: key for key, values in l[p].items() for value in values}
    
        tokens = text.strip().split(' ')
    
        labeled_tokens = [labels.get(token, "O") for token in tokens]
        
        output = " ".join(labeled_tokens)
        print(p, essay_output)
    
        data_output = {
            'text_id': p,
            'original_text': df['train_text'].values[p],
            'augmented_text': essay_output,
            'labels': output
        }
        llm_essays.append(data_output)
    except:
        print(p, 'error')

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


1150 1150
228  The analyses were conducted by utilizing STATA 11.0, a powerful statistical software developed by Statacorp, a renowned organization located in College Station, Texas. [Also keep these tokens in the final text: STATA, 11.0
51  In order to enhance the effectiveness of nonlinear labeling, this study proposes the extension of an individual atlas to multiple atlases using a recently developed, fully automated, and feature-based labeling method. This method, called Mindboggle, can be freely downloaded and is an open-source implementation of Matlab code. Specifically, Mindboggle enables the automatic labeling of complex structures based on patterns and features extracted from images.

In this approach, we use Mindboggle to label various structures in imaging data, allowing for more precise and accurate analysis. The method's ability to handle multiple atlases simultaneously makes it a powerful tool for large-scale research projects. The automation of the labeling process also 

Hopefully this notebook showed the process of generating essays using an open-source LLM / local LLM. Just swap out the model name with any model of your choice and you should be able to generate synthetic essays locally without using an API.

Enjoy!

In [15]:
llm_essays = pd.DataFrame(llm_essays)
llm_essays.to_csv('augmented.csv', index = False)