<a href="https://www.kaggle.com/code/rishithreddyabcdef/finetuning-llama3-2-final?scriptVersionId=258203285" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install trl==0.12.2
!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu118
!pip install -U "huggingface_hub[cli]"
!pip install transformers==4.46.3
!pip install accelerate==1.10.0
!pip install sentence-transformers==3.3.1
!pip install peft==0.9.0
!pip install numpy==1.26.4
!pip install evaluate
!pip install seqeval 
!pip install bitsandbytes==0.46.1
!pip install -U deepspeed

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    PreTrainedModel,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from seqeval.metrics import classification_report

In [None]:
# Interactive Hugging Face Hub login for this notebook session.
# NOTE(review): presumably needed because the checkpoint loaded later requires
# an authenticated download — confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Permit TF32 in CUDA matmul and cuDNN kernels.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Seed NumPy and PyTorch with one fixed value so runs are repeatable.
# NOTE(review): Python's built-in `random` module is not seeded here — confirm
# nothing in the run relies on it.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
def conll_to_sentence_tags(filepath):
    """Read a CoNLL-U file into one DataFrame row per sentence.

    Sentences are separated by blank lines and lines starting with ``#`` are
    treated as comments.  For every remaining line with at least four
    tab-separated columns, the second column is taken as the token and the
    fourth column as its UPOS tag.

    Multiword-token ranges (ID such as ``1-2``) and empty nodes (ID such as
    ``1.1``) are skipped: they are not surface words with their own tag, and
    keeping them would duplicate forms and leak a spurious ``_`` label into
    the tag set.

    Args:
        filepath: Path to a UTF-8 encoded CoNLL-U file.

    Returns:
        A ``pandas.DataFrame`` with columns ``sentence`` (list of tokens) and
        ``tags`` (list of tag strings), one row per non-empty sentence.
    """
    sentences = []
    tags = []
    cur_tokens, cur_tags = [], []
    with open(filepath, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank line closes the current sentence (if it has any tokens).
                if cur_tokens:
                    sentences.append(cur_tokens)
                    tags.append(cur_tags)
                    cur_tokens, cur_tags = [], []
                continue
            if line.startswith("#"):
                continue
            parts = line.split("\t")
            if len(parts) < 4:
                continue
            token_id = parts[0]
            # "1-2" is a multiword-token range, "1.1" an empty node: neither
            # is a regular word line.
            if "-" in token_id or "." in token_id:
                continue
            cur_tokens.append(parts[1])
            cur_tags.append(parts[3])
    # The file may end without a trailing blank line.
    if cur_tokens:
        sentences.append(cur_tokens)
        tags.append(cur_tags)
    return pd.DataFrame({"sentence": sentences, "tags": tags})
# Parse the train and test splits into one-row-per-sentence DataFrames.
train_df = conll_to_sentence_tags(r"/kaggle/input/conll-tel/te_mtg-ud-train.conllu")
test_df  = conll_to_sentence_tags(r"/kaggle/input/conll-tel/te_mtg-ud-test.conllu")

In [None]:
# Pool the per-sentence tag lists of both splits so every tag receives an id.
all_tag_lists = [*train_df["tags"], *test_df["tags"]]
unique_labels = set()
for sentence_tags in all_tag_lists:
    unique_labels.update(str(tag).strip() for tag in sentence_tags)
# Sorting makes the label <-> id assignment deterministic across runs.
label_list = sorted(unique_labels)
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))

In [None]:
# Checkpoint to fine-tune; the same tokenizer is reused by the later cells.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # fall back to EOS so padded batches can be built
# 4-bit NF4 quantisation with double quantisation and a float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
# Load the quantised causal LM; `device_map="auto"` leaves device placement to the library.
# NOTE(review): `trust_remote_code=True` allows repository code to execute —
# confirm it is actually required for this checkpoint.
base = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
# Keep the model's pad id in sync with the tokenizer's.
base.config.pad_token_id = tokenizer.pad_token_id
base = prepare_model_for_kbit_training(base)
# LoRA adapters on the four attention projections.
# NOTE(review): `task_type="TOKEN_CLS"` is paired with a causal-LM backbone
# here — confirm PEFT wraps it the way the custom head expects.
lora_cfg = LoraConfig(
    r=12,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="TOKEN_CLS",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

In [None]:
class LlamaForTokenClassification(PreTrainedModel):
    """Token-classification head on top of a PEFT-wrapped causal LM.

    The wrapped model is run with ``output_hidden_states=True``; its last
    hidden state goes through dropout and a linear layer that yields one
    logit per label for every input position.

    Args:
        peft_causal_model: The adapter-wrapped causal LM.  Its
            ``base_model.config`` is reused as this model's config.
        num_labels: Number of output classes.
        id2label: Mapping from class id to label string (stored on the config).
        label2id: Mapping from label string to class id (stored on the config).
        class_weights: Optional 1-D tensor of per-class loss weights.
    """

    def __init__(self, peft_causal_model, num_labels, id2label, label2id, class_weights=None):
        config = peft_causal_model.base_model.config
        super().__init__(config)
        self.peft_causal = peft_causal_model
        self.num_labels = num_labels
        hidden = config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden, num_labels)
        self.config.id2label = id2label
        self.config.label2id = label2id
        # Store as a buffer so it moves with .to(device).
        if class_weights is not None:
            self.register_buffer("class_weights", class_weights)
        else:
            self.class_weights = None

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Compute per-token logits and, when ``labels`` is given, the loss.

        Args:
            input_ids: Token ids passed to the wrapped model.
            attention_mask: Attention mask passed to the wrapped model.
            labels: Optional label ids shaped like ``input_ids``; positions
                equal to ``-100`` are ignored by the loss.

        Returns:
            A dict with ``"loss"`` (``None`` when no labels are given) and
            ``"logits"``.
        """
        outputs = self.peft_causal(
            input_ids=input_ids,
            attention_mask=attention_mask,
            use_cache=False,
            output_hidden_states=True,
            return_dict=True,
        )
        last_hidden = outputs.hidden_states[-1]
        # The backbone may emit a different dtype/device than the head's
        # parameters; align them so the linear layer also works when no
        # autocast context is active.
        head_weight = self.classifier.weight
        x = self.dropout(last_hidden.to(device=head_weight.device, dtype=head_weight.dtype))
        logits = self.classifier(x)
        loss = None
        if labels is not None:
            # Compute the loss in float32 for numerical stability and keep
            # labels / class weights on the logits' device.
            flat_logits = logits.view(-1, self.num_labels).float()
            flat_labels = labels.view(-1).to(flat_logits.device)
            weight = None
            if self.class_weights is not None:
                weight = self.class_weights.to(device=flat_logits.device, dtype=flat_logits.dtype)
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100, weight=weight)
            loss = loss_fct(flat_logits, flat_labels)

        return {"loss": loss, "logits": logits}


In [None]:
def compute_class_weights(train_dataset, num_labels, ignore_index=-100, smoothing=1.0):
    """Return inverse-frequency class weights as a float32 tensor.

    Every label id found in ``train_dataset["labels"]`` (an iterable of
    per-example id sequences) is counted, skipping ``ignore_index``.  Each
    weight is ``1 / (count + smoothing)``, and the vector is rescaled so that
    the weights sum to ``num_labels``.

    Args:
        train_dataset: Object whose ``"labels"`` entry yields sequences of ids.
        num_labels: Number of classes, i.e. the length of the result.
        ignore_index: Label id that marks positions to skip.
        smoothing: Constant added to every count before inversion.

    Returns:
        A 1-D ``torch.float32`` tensor of length ``num_labels``.
    """
    counts = np.zeros(num_labels, dtype=np.float64)
    counted_ids = (
        label_id
        for example_labels in train_dataset["labels"]
        for label_id in example_labels
        if label_id != ignore_index
    )
    for label_id in counted_ids:
        counts[label_id] += 1
    inverse_freq = 1.0 / (counts + smoothing)
    scale = num_labels / inverse_freq.sum()
    return torch.tensor(inverse_freq * scale, dtype=torch.float32)
# Sequence length passed as `max_length` when tokenising (padding/truncation).
MAX_LEN = 128
def prepare_dataset(df, tokenizer, label2id, max_length=128):
    """Tokenise sentences and align word-level tags to sub-word tokens.

    Args:
        df: DataFrame with a ``sentence`` column (token list or
            whitespace-separated string) and a ``tags`` column in the same
            form.
        tokenizer: Tokenizer called with ``is_split_into_words=True``; its
            output must expose ``word_ids``.
        label2id: Mapping from tag string to integer id.
        max_length: Length every example is padded / truncated to.

    Returns:
        The mapped dataset: the tokenizer's outputs, a ``labels`` column and
        the original ``tokens`` column.  Only the first sub-token of each word
        carries the word's label id; special tokens, padding and continuation
        sub-tokens are set to ``-100``.
    """

    def _as_list(value):
        # Accept both ready-made lists and whitespace-separated strings.
        return value if isinstance(value, list) else str(value).split()

    frame = df.copy()
    frame["tokens"] = frame["sentence"].apply(_as_list)
    frame["labels"] = frame["tags"].apply(_as_list)
    dataset = Dataset.from_pandas(frame, preserve_index=False)

    def tokenize_and_align_labels(examples, label_all_tokens=False):
        tokenized = tokenizer(
            examples["tokens"],
            is_split_into_words=True,
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
        aligned = []
        for batch_idx, word_labels in enumerate(examples["labels"]):
            prev_wid = None
            row = []
            for wid in tokenized.word_ids(batch_index=batch_idx):
                if wid is None:
                    # Special or padding token.
                    row.append(-100)
                elif wid != prev_wid or label_all_tokens:
                    # First sub-token of a word (or every sub-token on request).
                    row.append(label2id[word_labels[wid]])
                else:
                    # Continuation sub-token.
                    row.append(-100)
                prev_wid = wid
            aligned.append(row)
        tokenized["labels"] = aligned
        return tokenized

    def _encode_batch(examples):
        encoded = tokenize_and_align_labels(examples, label_all_tokens=False)
        encoded["tokens"] = examples["tokens"]
        return encoded

    # Drop every source column except the raw tokens.
    droppable = [name for name in dataset.column_names if name != "tokens"]
    return dataset.map(_encode_batch, batched=True, remove_columns=droppable)

In [None]:
# Tokenise both splits with the same maximum length.
train_dataset = prepare_dataset(train_df, tokenizer, label2id, max_length=MAX_LEN)
eval_dataset  = prepare_dataset(test_df,  tokenizer, label2id, max_length=MAX_LEN)
# Class weights are derived from the training split only.
class_weights = compute_class_weights(train_dataset, num_labels=len(label_list))

In [None]:
# Show the label inventory, then attach the LoRA adapters to the quantised model.
print("Num labels:", len(label_list))
print("Labels:", label_list)
peft_causal = get_peft_model(base, lora_cfg)
peft_causal.print_trainable_parameters()

In [None]:
# Wrap the adapter model with the token-classification head and class weights.
model = LlamaForTokenClassification(
    peft_causal_model=peft_causal,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    class_weights=class_weights
)
# Batch size 10 with 4 accumulation steps gives 40 examples per optimizer step
# on each device.
# NOTE(review): `eval_steps=10` evaluates on the whole eval set every 10 steps —
# confirm this frequency is intended.
args = TrainingArguments(
    output_dir="./lora-pos-tokencls",
    per_device_train_batch_size=10,
    gradient_accumulation_steps=4,
    num_train_epochs=9,
    learning_rate=0.00001,
    logging_steps=10,
    save_steps=200,
    eval_strategy="steps",
    eval_steps=10,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    optim='lion_32bit',
    gradient_checkpointing=False,
    report_to="tensorboard",  
)
# NOTE(review): newer transformers releases may prefer `processing_class` over
# the `tokenizer` argument — verify against the pinned version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)
trainer.train()
# Persist the trained model and the tokenizer to the same directory.
trainer.save_model("./lora-pos-tokencls")
tokenizer.save_pretrained("./lora-pos-tokencls")

In [None]:
# Token-level report from scikit-learn, imported under an alias so the
# seqeval `classification_report` already in scope is left untouched.
from sklearn.metrics import classification_report as token_classification_report

print("Running prediction on eval set...")
preds = trainer.predict(eval_dataset)
pred_ids = preds.predictions.argmax(-1)

# `prepare_dataset` labels only the first sub-token of every word and sets
# everything else (special tokens, padding, continuation pieces) to -100, so
# the scored positions are exactly those whose gold id is not -100.  There is
# no need to tokenise each sentence again to rediscover them.
true_labels, pred_labels = [], []
for row_labels, row_preds in zip(eval_dataset["labels"], pred_ids):
    sent_true, sent_pred = [], []
    for true_id, pred_id in zip(row_labels, row_preds):
        if true_id != -100:
            sent_true.append(id2label[int(true_id)])
            sent_pred.append(id2label[int(pred_id)])
    true_labels.append(sent_true)
    pred_labels.append(sent_pred)

# POS tags are plain per-token classes.  seqeval's report parses tags as
# IOB-style chunk labels, which mangles tag names such as "NOUN", so score
# the flattened tokens instead.
flat_true = [lab for seq in true_labels for lab in seq]
flat_pred = [lab for seq in pred_labels for lab in seq]
print(token_classification_report(flat_true, flat_pred, digits=4, zero_division=0))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Flatten the per-sentence label sequences into token-level lists.
y_true = [l for seq in true_labels for l in seq]
y_pred = [l for seq in pred_labels for l in seq]
# Restrict the matrix to labels that occur in the gold data; build the lookup
# set once instead of once per candidate label.
seen_labels = set(y_true)
active_labels = [lab for lab in label_list if lab in seen_labels]
cm = confusion_matrix(y_true, y_pred, labels=active_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=active_labels)
# Hand our own axes to `plot`; without `ax` it opens a new default-sized
# figure and a separately created 10x8 figure would stay empty.
fig, ax = plt.subplots(figsize=(10, 8))
disp.plot(ax=ax, xticks_rotation=90, values_format="d")
plt.tight_layout()
plt.show()

In [None]:
# One record per evaluation sentence: the raw text plus bracketed gold and
# predicted tag strings.
rows = [
    {
        "sentence": " ".join(tokens),
        "actual tags": f"[{', '.join(gold)}]",
        "predicted tags": f"[{', '.join(pred)}]",
    }
    for tokens, gold, pred in zip(eval_dataset["tokens"], true_labels, pred_labels)
]
out_path = "pos_predictions.xlsx"
out_df = pd.DataFrame(rows)
out_df.to_excel(out_path, index=False)
print(f"Predictions saved to {out_path}")
# Preview at most the first five records.
for row in rows[:5]:
    print(
        "\nInput:", row["sentence"],
        "\nActual:", row["actual tags"],
        "\nOutput:", row["predicted tags"]
    )

In [None]:
# Token-level report from scikit-learn, imported under an alias so the
# seqeval `classification_report` already in scope is left untouched.
from sklearn.metrics import classification_report as token_classification_report

print("Running prediction on train set...")
preds = trainer.predict(train_dataset)
pred_ids = preds.predictions.argmax(-1)

# `prepare_dataset` labels only the first sub-token of every word and sets
# everything else (special tokens, padding, continuation pieces) to -100, so
# the scored positions are exactly those whose gold id is not -100.  There is
# no need to tokenise each sentence again to rediscover them.
true_labels, pred_labels = [], []
for row_labels, row_preds in zip(train_dataset["labels"], pred_ids):
    sent_true, sent_pred = [], []
    for true_id, pred_id in zip(row_labels, row_preds):
        if true_id != -100:
            sent_true.append(id2label[int(true_id)])
            sent_pred.append(id2label[int(pred_id)])
    true_labels.append(sent_true)
    pred_labels.append(sent_pred)

# POS tags are plain per-token classes.  seqeval's report parses tags as
# IOB-style chunk labels, which mangles tag names such as "NOUN", so score
# the flattened tokens instead.
flat_true = [lab for seq in true_labels for lab in seq]
flat_pred = [lab for seq in pred_labels for lab in seq]
print(token_classification_report(flat_true, flat_pred, digits=4, zero_division=0))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Flatten the per-sentence label sequences into token-level lists.
y_true = [l for seq in true_labels for l in seq]
y_pred = [l for seq in pred_labels for l in seq]
# Restrict the matrix to labels that occur in the gold data; build the lookup
# set once instead of once per candidate label.
seen_labels = set(y_true)
active_labels = [lab for lab in label_list if lab in seen_labels]
cm = confusion_matrix(y_true, y_pred, labels=active_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=active_labels)
# Hand our own axes to `plot`; without `ax` it opens a new default-sized
# figure and a separately created 10x8 figure would stay empty.
fig, ax = plt.subplots(figsize=(10, 8))
disp.plot(ax=ax, xticks_rotation=90, values_format="d")
plt.tight_layout()
plt.show()

In [None]:
# Text-generation pipeline used for the prompt-based tagging cells.
# NOTE(review): `base` is the same object that was handed to `get_peft_model`
# earlier; PEFT typically injects adapter layers into that module in place, so
# this may run with the trained adapters active rather than as an untouched
# baseline — verify which is intended.
from transformers import pipeline
pipe = pipeline(
    "text-generation",
    model=base,
    tokenizer=tokenizer,
    device_map="auto"
)

In [None]:
# Instruction given to the model as the system message.  Built from adjacent
# string literals so that no stray quote characters or source indentation end
# up inside the prompt text.
system_prompt = (
    "Identify following POS tags for the given sentence including punctuation "
    "with only these tags ADJ, ADP, ADV, CCONJ, DET, NOUN, NUM, PART, PRON, "
    "PROPN, PUNCT, SCONJ, VERB. Output only tags for respective words without explanation.\n"
    "JUST GIVE IN THIS PROMPT\n"
    "TAGS: Respective_tag for respective_word"
)


In [None]:
def predict_tags(sentence: str):
    """Ask the chat model for the POS tags of ``sentence``.

    The system prompt and the sentence are passed to the pipeline as chat
    messages, so the pipeline applies the chat template itself.  Rendering
    the template to a string first and letting the pipeline tokenise it again
    can duplicate the begin-of-text token, and recovering the reply by
    slicing off ``len(prompt)`` characters is fragile.

    Args:
        sentence: The input sentence as plain text.

    Returns:
        The assistant's reply with surrounding whitespace stripped.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": sentence},
    ]
    outputs = pipe(messages, max_new_tokens=256, do_sample=False)
    # For chat input, `generated_text` is the conversation with the new
    # assistant message appended as the last element.
    return outputs[0]["generated_text"][-1]["content"].strip()

In [None]:
# Work on a deep copy so the prompting results do not modify `test_df`.
prompt_test=test_df.copy(deep=True)

In [None]:
# Each `sentence` cell is a list of tokens, so join the whole list into the
# input text.  Indexing `x[0]` would pick only the first token and joining
# that string would put a space between its characters.
prompt_test['predict_tags'] = prompt_test['sentence'].apply(lambda x: predict_tags(' '.join(x)))

In [None]:
# Display the frame with the generated tags.
prompt_test

In [None]:
# Quick manual check on a single short sentence.
predict_tags('చూసేరండీ ?')

In [None]:
# Save the prompt-based predictions; the DataFrame index is written as well.
prompt_test.to_excel('prompt.xlsx')