In [None]:
from pathlib import Path
from src.utils.config_loader import load_config
from src.utils.seed import seed_everything

# Project root, taken as the parent of the current working directory.
# `Path.cwd()` is used instead of `os.getcwd()` because `os` is not imported
# in this notebook, which made the original expression raise NameError.
base_dir = Path.cwd().parent

# Settings are read from `model_params.yaml` located in the project root.
config = load_config(base_dir / 'model_params.yaml')

# Seed with the value stored under general.seed in the config
# (presumably seeds all RNGs for reproducibility -- see src.utils.seed).
seed_everything(config['general']['seed'])

In [None]:
import torch
import transformers
from textwrap import dedent
from src.utils.label_mapping_regplans import id_to_label, label_to_id

class LlamaForNER:
    """Few-shot, chain-of-thought NER tagger on top of a text-generation pipeline.

    The model is prompted with a fixed instruction plus examples, asked to
    emit ``token label`` lines after a ``Final Answer:`` marker, and the
    answer is aligned back onto the caller-supplied token sequence.
    """

    def __init__(self, model_path: str):
        """Build the ``transformers`` text-generation pipeline for ``model_path``.

        Args:
            model_path: Model identifier or path handed to
                ``transformers.pipeline`` as ``model``.
        """
        self.model_id = model_path
        self.pipeline = transformers.pipeline(
            'text-generation',
            model=self.model_id,
            model_kwargs={'torch_dtype': torch.float16},  # FP16 for faster inference
        )
        tokenizer = self.pipeline.tokenizer
        # Reuse the EOS id as the padding id so generation has a pad token.
        tokenizer.pad_token_id = tokenizer.eos_token_id

        # Token id used both to stop generation and to pad (see predict_ner).
        self.terminators = tokenizer.eos_token_id

    def format_prompt(self, sentence):
        """Return the full few-shot prompt with ``sentence`` inserted at the end.

        Args:
            sentence: The raw sentence to be tagged.

        Returns:
            The prompt text with surrounding whitespace stripped.
        """
        # All lines of the template start at column 0, so `dedent` only
        # normalises whitespace-only lines; the text itself is left as is.
        prompt = dedent(
            f"""You are an expert in Natural Language Processing. Your task is to identify Named Entities (NER) in a given text.
The possible Named Entities are exclusively 'B-FELT' and 'I-FELT'. The entities are defined as follows:

- B-FELT: The beginning of a field zone name.
- I-FELT: The continuation of a field zone name.

Important Rules:
- A B-FELT must always appear before an I-FELT.
- An I-FELT cannot exist without a preceding B-FELT.

Below are some examples of sentences with their corresponding entities:

Example 1:
Sentence: "Adkomst til BFS1 og BFS2 skal være fra Solfjellveien ."
Entities:
BFS1 B-FELT
BFS2 B-FELT

Example 2:
Sentence: "På friområdene GF1 - GF3 tillates vanlig skjøtsel av trær og vegetasjon ."
Entities:
GF1 B-FELT
- I-FELT
GF3 B-FELT

Example 3:
Sentence: "Bebyggelsestype Innenfor BKS1-BKS6 og BFS2 skal det oppføres flermannsboliger , kjedeboliger og / eller rekkehus ."
Entities:
BKS1-BKS6 B-FELT
BFS2 B-FELT

Example 4:
Sentence: "Sonene med nemningane # 1 , # 2 og # 3 gjeld automatisk freda kulturminne , dyrkingsspor med id ."
Entities:
# B-FELT
1 I-FELT
# B-FELT
2 I-FELT
# B-FELT
3 I-FELT

When responding, please provide a detailed chain-of-thought explanation of your reasoning before the final answer. Clearly separate your response into two sections using the markers:
Chain-of-thought:
[Your detailed reasoning steps]
Final Answer:
[The list of token-label pairs]

Identify the Named Entities in the following sentence:
"{sentence}"
"""
        )
        return prompt.strip()

    def predict_ner(self, sentence, tokens, max_tokens=500, temperature=0.0, top_p=0.9):
        """Generate a completion for ``sentence`` and map it to per-token labels.

        Args:
            sentence: The raw sentence inserted into the prompt.
            tokens: The sentence's tokens; predictions are aligned to these.
            max_tokens: Upper bound on newly generated tokens.
            temperature: ``0`` (the default) selects greedy decoding. A
                positive value enables sampling with that temperature.
            top_p: Nucleus-sampling threshold; only used when sampling.

        Returns:
            A dict with the keys ``tokens``, ``pred_labels``,
            ``chain_of_thought`` and ``generated_text``.
        """
        prompt = self.format_prompt(sentence)

        generation_kwargs = {
            'max_new_tokens': max_tokens,
            'eos_token_id': self.terminators,
            'pad_token_id': self.terminators,
            # Let the pipeline strip the prompt. Slicing the full text by
            # len(prompt) assumes the echoed prompt is character-identical.
            'return_full_text': False,
        }
        if temperature and temperature > 0:
            generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
        else:
            # Sampling knobs are not forwarded in greedy mode: they would be
            # ignored and only trigger generation-config warnings.
            generation_kwargs['do_sample'] = False

        outputs = self.pipeline(prompt, **generation_kwargs)

        generated_text = outputs[0]['generated_text'].strip()
        pred_labels, chain_of_thought = self.extract_entities(generated_text, tokens)

        return {
            'tokens': tokens,
            'pred_labels': pred_labels,
            'chain_of_thought': chain_of_thought,
            'generated_text': generated_text
        }

    def extract_entities(self, output_text, tokens):
        """Split the model output into reasoning and labels aligned to ``tokens``.

        Everything after the first ``Final Answer:`` marker is parsed for
        lines consisting of exactly two whitespace-separated fields
        (``word label``). If the marker is missing, the whole output is
        parsed and the chain-of-thought is returned as an empty string.

        Args:
            output_text: The text generated by the model.
            tokens: The sentence's tokens.

        Returns:
            ``(pred_labels, chain_of_thought)`` where ``pred_labels`` has one
            label per token (``'O'`` for tokens the model did not list).
        """
        if "Final Answer:" in output_text:
            cot_section, final_section = output_text.split("Final Answer:", 1)
            # Drop the "Chain-of-thought:" marker itself if the model emitted it.
            if "Chain-of-thought:" in cot_section:
                cot_section = cot_section.split("Chain-of-thought:", 1)[1]
            chain_of_thought = cot_section.strip()
        else:
            # If markers are not found, assume the entire output is the final answer.
            chain_of_thought = ""
            final_section = output_text

        # Keep the pairs in output order; labels are not validated here.
        pairs = []
        for line in final_section.splitlines():
            fields = line.split()
            if len(fields) == 2:
                pairs.append((fields[0], fields[1]))

        return self._align_labels(tokens, pairs), chain_of_thought

    @staticmethod
    def _align_labels(tokens, pairs):
        """Assign each ``(word, label)`` pair to one occurrence of ``word``.

        A plain ``word -> label`` lookup would give every occurrence of a
        repeated token (e.g. ``-`` or ``#``) the same label. Instead, pairs
        are consumed left to right: each pair labels the next unlabelled
        occurrence at or after the previous match. A pair that cannot be
        matched that way falls back to the earliest unlabelled occurrence
        before the cursor. Words absent from ``tokens`` are ignored.
        """
        token_list = list(tokens)
        labels = ['O'] * len(token_list)
        taken = [False] * len(token_list)
        cursor = 0

        for word, label in pairs:
            match = next(
                (i for i in range(cursor, len(token_list))
                 if not taken[i] and token_list[i] == word),
                None,
            )
            if match is not None:
                cursor = match + 1
            else:
                # Out-of-order answer: look at positions already passed.
                match = next(
                    (i for i in range(cursor)
                     if not taken[i] and token_list[i] == word),
                    None,
                )
                if match is None:
                    continue
            labels[match] = label
            taken[match] = True

        return labels

In [None]:
from src.data.preprocessing import create_df
from llm_stuff.evaluation import evaluate 
from tqdm import tqdm
import json

val_df = create_df(base_dir / 'data/my_data/regplans-dev.conllu')

model_path = 'meta-llama/Meta-Llama-3-8B-Instruct'
ner_model = LlamaForNER(model_path)

# NOTE(review): only the first 5 rows are evaluated, so the metrics below do
# not cover the full dev set -- widen this slice for a complete run.
eval_df = val_df[:5]

all_pred_ids = []
all_true_ids = []

all_results = []

# Id substituted for any label the model invents (-1 if 'O' is not mapped).
fallback_id = label_to_id.get("O", -1)

for _, row in tqdm(eval_df.iterrows(), total=len(eval_df)):
    sentence = row['full_text']
    tokens = row['words']
    true_labels = row['labels']

    output = ner_model.predict_ner(sentence, tokens)
    pred_labels = output['pred_labels']

    pred_ids = []
    for label in pred_labels:
        if label not in label_to_id:
            print(f"Warning: Unexpected label '{label}' found. Assigning default label 'O'.")
        pred_ids.append(label_to_id.get(label, fallback_id))

    true_ids = [label_to_id[label] for label in true_labels]

    all_pred_ids.extend(pred_ids)
    all_true_ids.extend(true_ids)

    # The raw predicted labels (including unexpected ones) are kept here so
    # the saved results show exactly what the model produced.
    all_results.append({
        'sentence': sentence,
        'tokens': tokens,
        'true_labels': true_labels,
        'predicted_labels': pred_labels,
        'generated_text': output['generated_text']
    })

# Calculate evaluation metrics across all tokens of the evaluated rows.
metrics = evaluate(all_true_ids, all_pred_ids)

print("Evaluation Metrics on Dev Set:")
print(metrics)

final_output = {
    'prompt': ner_model.format_prompt(''),
    'evaluation_metrics': metrics,
    'results': all_results
}

# Create the results directory if needed; open(..., 'w') does not do so.
results_dir = base_dir / 'llm_stuff/results'
results_dir.mkdir(parents=True, exist_ok=True)

# File name is derived from the model id so the two cannot drift apart.
model_name = model_path.split('/')[-1]
output_path = results_dir / f'{model_name}_CHAINOFTHOUGHT.json'

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(final_output, f, indent=4, ensure_ascii=False)