In [None]:
import sys
import os

# Put the parent of the current working directory on the import path so the
# `src.*` imports used in later cells can be resolved from this notebook.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(_parent_dir)

In [None]:
from pathlib import Path
from src.utils.config_loader import load_config
from src.utils.seed import seed_everything

# The project root is taken to be the parent of the working directory.
base_dir = Path.cwd().parent

# Read the experiment settings stored next to the project root.
config = load_config(base_dir.joinpath('model_params.yaml'))

# Seed with the value configured under general -> seed.
seed_everything(config['general']['seed'])

In [None]:
import torch
import transformers
from textwrap import dedent
from src.utils.label_mapping_regplans import id_to_label, label_to_id

class LlamaForNER:
    """Few-shot NER tagger that prompts a causal language model for FELT entities.

    The model is loaded through a ``transformers`` text-generation pipeline,
    shown a fixed few-shot prompt, and its free-text answer is parsed back
    into one label per input token.
    """

    # Labels accepted from the model's answer. Any other second word on a line
    # (e.g. "Example 5:") is treated as noise rather than as a label.
    VALID_LABELS = frozenset({'O', 'B-FELT', 'I-FELT'})

    # The opening line break is escaped so that *every* line shares the same
    # margin; otherwise dedent() finds no common indentation and leaves the
    # source-code indentation inside the prompt.
    PROMPT_TEMPLATE = dedent(
        """\
        You are an expert in Natural Language Processing. Your task is to identify Named Entities (NER) in a given text.
        The possible Named Entities are exclusively 'B-FELT' and 'I-FELT'. The entities are defined as follows:

        - B-FELT: The beginning of a field zone name.
        - I-FELT: The continuation of a field zone name.

        Below are some examples of sentences with their corresponding entities:

        Example 1:
        Sentence: "Adkomst til BFS1 og BFS2 skal være fra Solfjellveien ."
        Entities:
        BFS1 B-FELT
        BFS2 B-FELT

        Example 2:
        Sentence: "På friområdene GF1 - GF3 tillates vanlig skjøtsel av trær og vegetasjon ."
        Entities:
        GF1 B-FELT
        - I-FELT
        GF3 B-FELT

        Example 3: 
        Sentence: "Bebyggelsestype Innenfor BKS1-BKS6 og BFS2 skal det oppføres flermannsboliger , kjedeboliger og / eller rekkehus ."
        Entities:
        BKS1-BKS6 B-FELT
        BFS2 B-FELT

        Example 4:
        Sentence: "Sonene med nemningane # 1 , # 2 og # 3 gjeld automatisk freda kulturminne , dyrkingsspor med id ."
        Entities:
        # B-FELT
        1 I-FELT
        # B-FELT
        2 I-FELT
        # B-FELT
        3 I-FELT

        Your task is to identify the Named Entities in the following sentence: "{sentence}"
        """
    )

    def __init__(self, model_path: str):
        """Load the text-generation pipeline for ``model_path``.

        Args:
            model_path: Model identifier or path handed to
                ``transformers.pipeline`` as ``model``.
        """
        self.model_id = model_path
        self.pipeline = transformers.pipeline(
            'text-generation',
            model=self.model_id,
            model_kwargs={'torch_dtype': torch.float16},  # FP16 for faster inference
        )
        tokenizer = self.pipeline.tokenizer
        # Compare against None explicitly: token id 0 is a valid id but falsy.
        if tokenizer.eos_token_id is not None:
            self.terminators = tokenizer.eos_token_id
        else:
            self.terminators = tokenizer.pad_token_id

    def format_prompt(self, sentence):
        """Return the few-shot prompt with ``sentence`` inserted as the query.

        The template is dedented before the sentence is substituted, so the
        content of ``sentence`` cannot influence the dedenting.
        """
        return self.PROMPT_TEMPLATE.format(sentence=sentence).strip()

    def predict_ner(self, sentence, tokens, max_tokens=50, temperature=0.0, top_p=0.9):
        """Generate an answer for ``sentence`` and map it onto ``tokens``.

        Args:
            sentence: The sentence text placed in the prompt.
            tokens: The sentence's tokens; one label is produced per token.
            max_tokens: Upper bound on newly generated tokens.
            temperature: Sampling temperature. ``0`` (the default) selects
                greedy decoding; a positive value turns sampling on.
            top_p: Nucleus-sampling threshold, only used when sampling.

        Returns:
            A dict with the input ``tokens``, the per-token ``pred_labels``
            and the raw ``generated_text``.
        """
        prompt = self.format_prompt(sentence)

        sampling = bool(temperature) and temperature > 0
        generation_kwargs = {
            'max_new_tokens': max_tokens,
            'eos_token_id': self.terminators,
            'do_sample': sampling,
        }
        if sampling:
            # Sampling knobs are only meaningful (and only valid) when
            # sampling; a temperature of 0 is not a usable sampling value.
            generation_kwargs['temperature'] = temperature
            generation_kwargs['top_p'] = top_p

        outputs = self.pipeline(prompt, **generation_kwargs)

        # The returned text is sliced as prompt + continuation; keep only the
        # continuation.
        generated_text = outputs[0]['generated_text'][len(prompt):].strip()
        pred_labels = self.extract_entities(generated_text, tokens)

        return {
            'tokens': tokens,
            'pred_labels': pred_labels,
            'generated_text': generated_text
        }

    def extract_entities(self, output_text, tokens):
        """Parse ``word label`` lines from ``output_text`` into per-token labels.

        Lines that do not consist of exactly two whitespace-separated parts,
        or whose second part is not in ``VALID_LABELS``, are ignored. Accepted
        pairs are matched to ``tokens`` in order of appearance, so a repeated
        token is only tagged as many times as the answer lists it. Tokens
        without a matching pair are labelled ``'O'``.

        Returns:
            A list of labels with the same length as ``tokens``.
        """
        token_list = list(tokens)
        pred_labels = ['O'] * len(token_list)
        cursor = 0  # first token position not yet consumed by an answer line

        for line in output_text.splitlines():
            parts = line.split()
            if len(parts) != 2:
                continue
            word, label = parts
            if label not in self.VALID_LABELS:
                continue

            position = self._find_token(token_list, pred_labels, word, cursor)
            if position is None:
                continue

            pred_labels[position] = label
            if position >= cursor:
                cursor = position + 1

        return pred_labels

    @staticmethod
    def _find_token(token_list, labels, word, start):
        """Return the index of the token that an answer line for ``word`` refers to.

        Searches forward from ``start`` first; if nothing is found, falls back
        to an earlier occurrence that is still labelled ``'O'`` (the answer
        was out of order). Returns ``None`` when ``word`` cannot be placed.
        """
        for index in range(start, len(token_list)):
            if token_list[index] == word:
                return index
        for index in range(min(start, len(token_list))):
            if token_list[index] == word and labels[index] == 'O':
                return index
        return None

Unnamed: 0,full_text,words,labels
0,Angitte samferdselsanlegg og / eller teknisk i...,"[Angitte, samferdselsanlegg, og, /, eller, tek...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,BFS1 og BFS3 er ferdig utbygd med eneboliger .,"[BFS1, og, BFS3, er, ferdig, utbygd, med, eneb...","[B-FELT, O, B-FELT, O, O, O, O, O, O]"
2,En sammenhengende underetasje tillates etabler...,"[En, sammenhengende, underetasje, tillates, et...","[O, O, O, O, O, O, O, B-FELT, O, O, O, O, O, O]"
3,Bebyggelsestype Innenfor BKS1-BKS6 og BFS2 ska...,"[Bebyggelsestype, Innenfor, BKS1-BKS6, og, BFS...","[O, O, B-FELT, O, B-FELT, O, O, O, O, O, O, O,..."
4,"4.01 Kjørevei , offentlig SV_o , er regulert t...","[4.01, Kjørevei, ,, offentlig, SV_o, ,, er, re...","[O, O, O, O, B-FELT, O, O, O, O, O, O, O, O, O..."


In [None]:
from src.data.preprocessing import create_df
from llm_stuff.evaluation import evaluate 

# Load the development split. The loop below reads the 'full_text', 'words'
# and 'labels' columns of each row.
val_df = create_df(base_dir / 'data/my_data/regplans-dev.conllu')

model_path = 'meta-llama/Meta-Llama-3-8B-Instruct'
ner_model = LlamaForNER(model_path)

# Only the leading sentences are evaluated; set to None to run the
# whole dev set.
max_eval_sentences = 5
eval_df = val_df[:max_eval_sentences]

# The model's answer is free text, so a predicted label may not exist in the
# label map; such predictions are scored as 'O' instead of raising KeyError.
fallback_id = label_to_id['O']

all_pred_ids = []
all_true_ids = []

for _, row in eval_df.iterrows():
    sentence = row['full_text']
    tokens = row['words']
    true_labels = row['labels']

    output = ner_model.predict_ner(sentence, tokens)
    pred_labels = output['pred_labels']

    pred_ids = [
        label_to_id[label] if label in label_to_id else fallback_id
        for label in pred_labels
    ]
    # Gold labels are expected to be in the map; a KeyError here signals a
    # data problem and is intentionally not masked.
    true_ids = [label_to_id[label] for label in true_labels]

    all_pred_ids.extend(pred_ids)
    all_true_ids.extend(true_ids)

    print(f"Sentence: {sentence}")
    print(f"Predicted Labels: {pred_labels}")
    print(f"True Labels: {true_labels}")

# Calculate evaluation metrics across all tokens of the evaluated sentences.
metrics = evaluate(all_true_ids, all_pred_ids)

print(f"Evaluation Metrics on Dev Set (first {len(eval_df)} sentences):")
print(metrics)