In [7]:
!pip install -q transformers datasets seqeval scikit-learn

In [21]:
!pip install -q transformers==4.31.0

In [6]:
import json
import torch
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from datasets import Dataset, DatasetDict
from transformers import (
    BertTokenizerFast, BertForTokenClassification,
    TrainingArguments, Trainer, DataCollatorForTokenClassification,
    EarlyStoppingCallback
)
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

In [9]:
BIO_FILE = "bio_tagged_all_jobs.json"

with open(BIO_FILE, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

print(f"Loaded {len(raw_data)} jobs")

Loaded 6697 jobs


In [10]:
all_data = []
skipped = 0

for job in raw_data:
    for sent in job.get("tagged_sentences", []):
        tokens = sent.get("tokens")
        labels = sent.get("labels")

        if isinstance(tokens, str): tokens = tokens.strip().split()
        if isinstance(labels, str): labels = labels.strip().split()

        if isinstance(tokens, list) and isinstance(labels, list) and len(tokens) == len(labels):
            all_data.append({"tokens": tokens, "labels": labels})
        else:
            skipped += 1

print(f"Skipped {skipped} samples")

Skipped 0 samples


In [11]:
unique_labels = sorted({label for sample in all_data for label in sample["labels"]})
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
print(f"Total labels: {len(label2id)}", label2id)

for sample in all_data:
    sample["label_ids"] = [label2id[label] for label in sample["labels"]]

Total labels: 13 {'B-CERT': 0, 'B-FIELD': 1, 'B-LANG': 2, 'B-SKILL': 3, 'B-SOFT_SKILL': 4, 'B-TOOL': 5, 'I-CERT': 6, 'I-FIELD': 7, 'I-LANG': 8, 'I-SKILL': 9, 'I-SOFT_SKILL': 10, 'I-TOOL': 11, 'O': 12}


In [12]:
train_data, val_data = train_test_split(all_data, test_size=0.1, random_state=42)

dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data)
})

print(f"Training samples: {len(train_data)} | Validation samples: {len(val_data)}")

Training samples: 171211 | Validation samples: 19024


In [None]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(example):
    tokenized = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_offsets_mapping=True
    )

    labels = example["label_ids"]
    word_ids = tokenized.word_ids()

    aligned_labels = []
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)
        else:
            aligned_labels.append(labels[word_idx])

    tokenized["labels"] = aligned_labels
    del tokenized["offset_mapping"]
    return tokenized

In [None]:
encoded_dataset = dataset.map(tokenize_and_align_labels, remove_columns=["tokens", "labels", "label_ids"])

Map:   0%|          | 0/171211 [00:00<?, ? examples/s]

Map:   0%|          | 0/19024 [00:00<?, ? examples/s]

In [None]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=2)

    true_preds, true_labels = [], []

    for pred_seq, label_seq in zip(predictions, labels):
        temp_preds, temp_labels = [], []
        for p, l in zip(pred_seq, label_seq):
            if l != -100:
                temp_preds.append(id2label[p])
                temp_labels.append(id2label[l])
        true_preds.append(temp_preds)
        true_labels.append(temp_labels)

    print("\n📋 Classification Report:")
    print(classification_report(true_labels, true_preds))

    return {
        "accuracy": accuracy_score(true_labels, true_preds),
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds)
    }

In [None]:
args = TrainingArguments(
    output_dir="./bert_bio_model_final",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
    fp16=True
)

collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    data_collator=collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0436,0.0437,0.980537,0.742115,0.928107,0.824755
2,0.0383,0.04044,0.981246,0.761886,0.900795,0.825538
3,0.0338,0.040136,0.982186,0.759861,0.935833,0.838716
4,0.0293,0.040711,0.982992,0.783808,0.902386,0.838928
5,0.0238,0.045854,0.983155,0.779365,0.922381,0.844863



📋 Classification Report:
              precision    recall  f1-score   support

        CERT       0.49      0.49      0.49       104
       FIELD       0.79      0.96      0.87      5623
        LANG       0.80      0.93      0.86      4221
       SKILL       0.71      0.95      0.81      2590
  SOFT_SKILL       0.67      0.98      0.80      3332
        TOOL       0.73      0.87      0.79      6135

   micro avg       0.74      0.93      0.82     22005
   macro avg       0.70      0.86      0.77     22005
weighted avg       0.74      0.93      0.83     22005


📋 Classification Report:
              precision    recall  f1-score   support

        CERT       0.52      0.62      0.56       104
       FIELD       0.78      0.98      0.87      5623
        LANG       0.81      0.95      0.87      4221
       SKILL       0.77      0.89      0.82      2590
  SOFT_SKILL       0.70      0.93      0.80      3332
        TOOL       0.75      0.79      0.77      6135

   micro avg       0.76  

TrainOutput(global_step=53505, training_loss=0.03758225473298969, metrics={'train_runtime': 7530.0127, 'train_samples_per_second': 113.686, 'train_steps_per_second': 7.106, 'total_flos': 5.592666014348928e+16, 'train_loss': 0.03758225473298969, 'epoch': 5.0})

In [None]:
model_path = "bio_ner_model_final"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

with open(f"{model_path}/label_mappings.json", "w") as f:
    json.dump({"id2label": id2label, "label2id": label2id}, f, indent=4)

print("Model and tokenizer saved to `bio_ner_model_final/`")

Model and tokenizer saved to `bio_ner_model_final/`


#Inference

In [2]:
#testing on new skillphrases which are not in our predefined lists.
test_sentences = [
    "We used LangChain with a RAG pipeline connected to a vector database like Qdrant, fine-tuned a LLaMA model using LoRA and PEFT, deployed it with FastAPI, and tracked experiments via Weights & Biases, while ensuring fairness through SHAP values and RLHF."
]

In [9]:
MODEL_PATH = "bio_ner_model_final/bio_ner_model_final"

tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)
model = BertForTokenClassification.from_pretrained(MODEL_PATH)
model.eval()

with open(f"{MODEL_PATH}/label_mappings.json", "r") as f:
    mappings = json.load(f)
id2label = {int(k): v for k, v in mappings["id2label"].items()}

for i, sentence in enumerate(test_sentences):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=256, is_split_into_words=False)
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)[0]

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = [id2label[pred.item()] for pred in predictions]

    print(f"\nSentence : {sentence}")
    print("Tokens :")
    print(" ".join(tokens))
    print("Labels :")
    print(" ".join(labels))


Sentence : We used LangChain with a RAG pipeline connected to a vector database like Qdrant, fine-tuned a LLaMA model using LoRA and PEFT, deployed it with FastAPI, and tracked experiments via Weights & Biases, while ensuring fairness through SHAP values and RLHF.
Tokens :
[CLS] We used Lang ##C ##hai ##n with a RA ##G pipeline connected to a vector database like Q ##dra ##nt , fine - tuned a LL ##a ##MA model using Lo ##RA and P ##EF ##T , deployed it with Fast ##AP ##I , and tracked experiments via Weight ##s & B ##ias ##es , while ensuring fair ##ness through SH ##AP values and R ##L ##HF . [SEP]
Labels :
O O O B-TOOL B-TOOL B-TOOL B-TOOL O O O O O O O O O O O B-TOOL B-TOOL B-TOOL O O O O O O O O O O O O O O O O O O O O B-TOOL B-TOOL B-TOOL O O O O O O O O O O O O O O O O O B-SKILL B-SKILL O O O O O O O
