NP-CHUNKING (I-O-B tagging) with LLMs.  

Stephan Raaijmakers, LUCL, 28.03.2025

In [2]:
from transformers import AutoModelForCausalLM , AutoTokenizer, pipeline
from huggingface_hub import InferenceClient
import sys
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import re

In [4]:
# Read the Hugging Face API token from the Colab secret named 'HF_TOKEN'.
# generate_iob_tags() below reads HUGGINGFACE_TOKEN as a module-level global.
from google.colab import userdata
HUGGINGFACE_TOKEN=userdata.get('HF_TOKEN')

Note: you will need a Hugging Face PRO account ($9/mo) to use the InferenceClient with models that are not available on the free tier.

In [5]:
def generate_iob_tags(lines, output_file, model="mistralai/Mistral-7B-Instruct-v0.2"):
    """Ask a chat LLM for NP-chunking IOB tags, one request per sentence.

    Each sentence is sent to the Hugging Face Inference API together with a
    fixed instruction prompt; the reply is flattened onto one line and written
    to ``output_file`` as a ``Sentence:...`` / ``Tags:...`` pair of lines.

    Args:
        lines: Sequence of sentences, each given as a list of word strings
            (the words are joined with single spaces to rebuild the sentence).
        output_file: Path of the text file to (over)write with the results.
        model: Hub model id used for chat completion. Defaults to the model
            the notebook originally hard-coded
            (e.g. "meta-llama/Meta-Llama-3-8B-Instruct" is an alternative).

    Notes:
        Reads the module-level ``HUGGINGFACE_TOKEN``. Outside Colab, set it
        first, e.g. ``HUGGINGFACE_TOKEN = os.environ["HUGGINGFACE_TOKEN"]``
        after running ``export HUGGINGFACE_TOKEN="..."`` in the shell.
    """
    print("Generating IOB tags...")

    # Loop-invariant instruction text; the sentence is appended per request.
    prompt = (
        "Assign IOB tags for NP-chunking to the following sentence. "
        "Check that no taggings are repeated in your output. "
        "Every word in the sentence should have just one tag. "
        "If you have multiple options just output one. "
        "Tags are limited to O, B-NP, I-NP. "
        "Output format is: word/tag, per separate word. "
        "Example: The sandwich was good. "
        "Output: The/B-NP sandwich/I-NP was/O good/O. "
        "Sentence: "
    )

    client = InferenceClient(api_key=HUGGINGFACE_TOKEN)

    # The context manager guarantees the file is flushed and closed even if a
    # request raises part-way through; UTF-8 avoids encode errors when the
    # model returns characters outside the platform's default encoding.
    with open(output_file, "w", encoding="utf-8") as outp:
        for words in tqdm(lines):
            sentence = " ".join(words)

            completion = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt + sentence}],
                max_tokens=500,
                temperature=0.0,
            )

            # NOTE(review): content is treated as "" if the API returns None
            # for it - confirm whether that can happen for this endpoint.
            iob_tagged = completion.choices[0].message.content or ""
            # Put the reply on one line and drop the model's boilerplate lead-in.
            iob_tagged = iob_tagged.replace("\n", " ")
            iob_tagged = iob_tagged.replace("Here is the output: ", "")
            outp.write("Sentence:%s\nTags:%s\n" % (sentence, iob_tagged))

In [12]:

def main(input_file, output_file):
    """Read sentences from ``input_file`` and write their IOB tags to ``output_file``.

    Args:
        input_file: Path of a text file with one sentence per line; each line
            is split on single spaces into words.
        output_file: Path passed on to ``generate_iob_tags`` for the results.
    """
    with open(input_file, "r") as f:
        # rstrip() removes the trailing newline/whitespace. Lines that are
        # empty afterwards are skipped so that no request is sent for an
        # empty sentence (e.g. a blank line at the end of the file).
        lines = [
            stripped.split(" ")
            for stripped in (raw.rstrip() for raw in f)
            if stripped
        ]
    generate_iob_tags(lines, output_file)
    print("See ", output_file)

In [13]:
# Tag every sentence in det.txt (one sentence per line) and write the results to iob.out.
main("det.txt", "iob.out")

Generating IOB tags...


100%|██████████| 8/8 [00:03<00:00,  2.59it/s]

See  iob.out



