# Install Libraries

In [1]:
!pip install -q -U transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[?25h

# Load Model

In [2]:
! pip install flash-attn --no-build-isolation

Collecting flash-attn
  Downloading flash_attn-2.7.4.post1.tar.gz (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.7.4.post1-cp310-cp310-linux_x86_64.whl size=187797312 sha256=b267f80a08e516292cdd748056a2178a45b8abedf7fca123292eb17c21c8c87c
  Stored in directory: /root/.cache/pip/wheels/59/ce/d5/08ea07bfc16ba218dc65a3a7ef9b6a270530bcbd2cea2ee1ca
Successfully built flash-attn
Installing collected packages: flash-attn
Successfully installed flash-attn-2.7.4.post1


In [3]:
import torch
import random
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

model_path="/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

tokenizer=AutoTokenizer.from_pretrained(model_path)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype = torch.bfloat16,
    device_map = "auto",
    trust_remote_code = True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
%%time

def generate_essay(prompt, max_tokens):
    messages = [{
        "role":"user",
        "content": prompt
    }]

    model_inputs = tokenizer.apply_chat_template(messages, return_tensors = "pt").to('cuda')

    if max_tokens == 0:
        max_tokens = 100
    
    # Setting `pad_token_id` to `eos_token_id` for open-ended generation.
    with torch.no_grad():
        generated_ids = model.generate(
            model_inputs,
            max_new_tokens = max_tokens,
            do_sample = True,
            pad_token_id = tokenizer.eos_token_id
        )

    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    text = decoded[0].split("[/INST]")[1]
    return text

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.91 µs


# Read Prompts

In [5]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path

with open("/kaggle/input/workshop-task-acl/SOMD2025-PhaseI/train_texts.txt", "r", encoding="utf-8") as file:
    text = file.read()

text_list = text.split('\n')

with open("/kaggle/input/workshop-task-acl/SOMD2025-PhaseI/train_entities.txt", "r", encoding="utf-8") as file:
    text = file.read()

labels_list = text.split('\n')

In [6]:
df = pd.DataFrame()
df['train_text'] = text_list
df['train_labels'] = labels_list

In [7]:
df['train_text_list'] = df['train_text'].str.split(' ')
df['train_labels_list'] = df['train_labels'].str.split(' ')

In [8]:
df['unique_labels'] = df['train_labels_list'].apply(lambda x: ' '.join(list(set(x))))

In [9]:
df['document'] = np.arange(len(df))

In [10]:
all_labels = [
    'B-Extension','I-Extension','B-Application','I-Application','B-Abbreviation','B-Citation','I-Citation',
    'B-SoftwareCoreference','I-SoftwareCoreference','B-URL','I-URL','B-AlternativeName', 'I-AlternativeName',
    'B-OperatingSystem','I-OperatingSystem','B-Developer','I-Developer','O','B-License','I-License','B-PlugIn','I-PlugIn',
    'B-Release','I-Release','B-ProgrammingEnvironment','I-ProgrammingEnvironment','B-Version','I-Version']

id2label = {i: l for i, l in enumerate(all_labels)}
label2id = {v: k for k, v in id2label.items()}
target = [l for l in all_labels if l != "O"]

In [11]:
from collections import defaultdict

def bundle_entities(tokens, labels):
    entity_dict = defaultdict(list)
    current_entity_tokens = []
    current_label = None

    for token, label in zip(tokens, labels):
        if label == 'O':  # Reset when outside an entity
            if current_entity_tokens and current_label:
                entity_dict[current_label].append(" ".join(current_entity_tokens))
            current_entity_tokens = []
            current_label = None
            continue

        # If we encounter a new entity (either 'B-' or 'I-' type)
        entity = label

        # If we are starting a new entity, process the previous one first
        if current_entity_tokens and current_label != entity:
            entity_dict[current_label].append(" ".join(current_entity_tokens))
            current_entity_tokens = []  # Reset the tokens for the new entity

        # Add the current token to the current entity
        current_entity_tokens.append(token)
        current_label = entity

    # Add last entity if it exists
    if current_entity_tokens and current_label:
        entity_dict[current_label].append(" ".join(current_entity_tokens))

    return dict(entity_dict)

In [12]:
l = []
for i in range(0, len(df)):
    l.append(bundle_entities(df.values[i][2], df.values[i][3]))

In [13]:
# Merging all entity values
merged_entities = defaultdict(set)

for entity_dict in l:
    for entity, values in entity_dict.items():
        merged_entities[entity].update(values)  # Using a set to remove duplicates

# Convert back to lists
merged_entities = {key: list(values) for key, values in merged_entities.items()}

# Generate Texts

In [14]:
llm_essays = []
essay_counter = 0

np.random.seed(73)  
all_numbers_1 = list(range(0, len(df)))
np.random.shuffle(all_numbers_1)

np.random.seed(37)  
all_numbers_2 = list(range(0, len(df)))
np.random.shuffle(all_numbers_2) 

In [15]:
for p1, p2 in zip(all_numbers_1, all_numbers_2):
    merged_labels = {}
    try:
        for key in set(l[p1].keys()).union(set(l[p2].keys())):
            merged_labels[key] = l[p1].get(key, []) + l[p2].get(key, [])

        print(p1, p2)
            
        combined = [item for sublist in merged_labels.values() for item in sublist]
        prompt_combined = f'''
                merge these 2 texts and rephrase on syntactic and semantic level 
                text1  - {df['train_text'].values[p1]}.
                text2  - {df['train_text'].values[p2]}.
                ALSO KEEP THESE TOKENS in merged text as it is - {combined}.
                '''
        # generate the essay
        essay_output = generate_essay(prompt_combined, (len(df['train_text'].values[p1]) + len(df['train_text'].values[p2]))//1.5)
        essay_counter += 1
    
        labels = {value: key for key, values in merged_labels.items() for value in values}
    
        tokens = text.strip().split(' ')
    
        labeled_tokens = [labels.get(token, "O") for token in tokens]
        
        output = " ".join(labeled_tokens)

    
        data_output = {
            'text_id_1': p1,
            'text_id_2': p2,
            'original_text_1': df['train_text'].values[p1],
            'original_text_2': df['train_text'].values[p2],
            'labels_1': df['train_labels'].values[p1],
            'labels_2': df['train_labels'].values[p2],
            'augmented_text': essay_output,
            'labels': output
        }
        llm_essays.append(data_output)
    except:
        print(p1, p2, 'error')
        pass

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


574 1058
656 210
685 1034
986 840
906 574
504 496
353 181
869 558
616 838
111 793
534 996
1061 148
697 909
945 938
421 1071
204 748
603 11
470 846
905 69
262 865
997 508
219 595
725 408
664 92
28 875
776 944
176 233
1046 851
493 780
948 945
541 764
850 534
26 200
1051 284
70 1090
543 771
331 138
341 1103
926 65
786 678
1007 168
524 924
410 266
588 183
271 983
551 490
183 179
557 877
1066 895
433 13
412 1006
41 884
448 669
660 521
512 2
50 1069
465 38
910 264
130 952
659 1101
789 1097
924 908
833 719
24 919
180 530
171 274
406 540
740 982
416 388
1089 447
718 745
836 1020
305 162
617 153
613 271
667 369
870 790
854 1013
843 122
57 668
3 1134
464 260
619 858
1053 693
427 850
1131 763
491 49
238 358
842 544
731 631
621 1131
14 290
932 277
359 422
565 797
980 468
1140 262
27 446
751 537
441 1033
280 643
671 663
532 1028
847 815
655 986
349 730
1026 837
700 37
903 313
966 471
1042 294
124 922
172 915
959 913
1020 898
763 191
645 775
834 864
954 625
515 1109
301 253
549 1038
332 572
185 781


In [16]:
llm_essays = pd.DataFrame(llm_essays)
llm_essays.to_csv('augmented.csv', index = False)