In [None]:
# -------------------- 1. Install required libraries --------------------
# transformers -> tokenizer, model, TrainingArguments and Trainer used below
# datasets     -> the Dataset container and train/test split
# seqeval      -> f1_score / classification_report used by compute_metrics
# The leading "!" is an IPython shell escape, so this line only works when
# the file is executed as a notebook cell.
!pip install transformers datasets seqeval --quiet

# -------------------- 2. Prepare your label list --------------------
# Tag inventory for the slot-filling task; a tag's position in this list is
# the integer class id used everywhere else.
label_list = [
    "O",
    "B-from",
    "B-to",
    "B-date",
    "I-date",
    "B-count",
]
# Reverse (id -> tag) and forward (tag -> id) lookup tables.
id_to_label = dict(enumerate(label_list))
label_to_id = {tag: idx for idx, tag in id_to_label.items()}

# -------------------- 3. Load training data from a CoNLL-style file --------------------
from datasets import Dataset

def conll_to_data(filepath, label_map=None):
    """Read a CoNLL-style file into parallel token and tag-id lists.

    Each non-blank line is split on whitespace; the first column is the
    token and the second column is its tag (any further columns are
    ignored).  Blank lines separate sentences.  Lines with fewer than two
    columns are skipped.

    Args:
        filepath: Path of the UTF-8 text file to read.
        label_map: Optional mapping from tag string to integer id.  When
            omitted, the module-level ``label_to_id`` is used.

    Returns:
        A dict with two equally long lists: ``"tokens"`` holds one list of
        token strings per sentence and ``"ner_tags"`` holds the matching
        lists of integer tag ids.

    Raises:
        KeyError: If a tag is not present in the label mapping.  The
            message names the file and the offending line number.
    """
    if label_map is None:
        label_map = label_to_id

    sentences = []
    sentence_tags = []
    current_tokens = []
    current_tags = []

    with open(filepath, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            columns = raw_line.split()

            if not columns:
                # Blank line: close the sentence collected so far (if any).
                if current_tokens:
                    sentences.append(current_tokens)
                    sentence_tags.append(current_tags)
                    current_tokens, current_tags = [], []
                continue

            if len(columns) < 2:
                # Malformed line (token without a tag): skip it.
                continue

            token, tag = columns[0], columns[1]
            try:
                tag_id = label_map[tag]
            except KeyError:
                # Same exception type as a plain lookup, but say where the
                # bad tag came from so the data file can be fixed.
                raise KeyError(
                    f"unknown tag {tag!r} at {filepath}:{line_no}"
                ) from None
            current_tokens.append(token)
            current_tags.append(tag_id)

    # The file may end without a trailing blank line.
    if current_tokens:
        sentences.append(current_tokens)
        sentence_tags.append(current_tags)

    return {
        "tokens": sentences,
        "ner_tags": sentence_tags,
    }

# Parse the training file into per-sentence token lists and tag-id lists.
data = conll_to_data("train.txt")

dataset = Dataset.from_dict(data)
# Hold out 20% of the sentences for evaluation.  The fixed seed keeps the
# split (and therefore the reported metrics) the same on every run.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# -------------------- 4. Load tokenizer and model --------------------
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Store the tag names in the model config so saved checkpoints and
# inference pipelines report "B-from", "B-to", ... instead of generic
# "LABEL_n" names.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)

# -------------------- 5. Tokenize and align labels --------------------
def tokenize_and_align_labels(example, *, label_all_tokens=True):
    """Tokenize one pre-split sentence and align its tag ids to word pieces.

    Args:
        example: Mapping with ``"tokens"`` (the sentence as a list of words)
            and ``"ner_tags"`` (one integer tag id per word).
        label_all_tokens: When true (the default, matching the previous
            behaviour) every word piece of a word receives that word's tag
            id.  When false only the first word piece of each word is
            labelled and the remaining pieces receive -100.

    Returns:
        The tokenizer output with an added ``"labels"`` list holding one
        entry per position reported by ``word_ids()``.
    """
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_tags = example["ner_tags"]

    labels = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None:
            # Position with no source word (e.g. special tokens): -100 is
            # the value compute_metrics filters out.
            labels.append(-100)
        elif label_all_tokens or word_idx != previous_word_idx:
            labels.append(word_tags[word_idx])
        else:
            # Continuation piece of the previous word, left unlabelled.
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Run the alignment over both splits.  batched=True is not passed, so the
# function is called with one example (one sentence) at a time.
tokenized_dataset = dataset.map(tokenize_and_align_labels)

# -------------------- 6. Training setup --------------------
from transformers import TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, f1_score

def compute_metrics(p):
    """Score one evaluation pass with seqeval.

    ``p`` unpacks into ``(predictions, labels)``.  The predicted class at
    each position is the argmax over axis 2 of ``predictions`` (presumably
    shaped batch x sequence x classes -- confirm against the Trainer).
    Positions whose gold label is -100 are dropped before scoring.

    Returns:
        A dict with ``"f1"`` (seqeval ``f1_score``) and ``"report"``
        (the text produced by seqeval ``classification_report``).
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_preds = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_preds.append([id_to_label[pred] for pred, _ in kept])
        true_labels.append([id_to_label[gold] for _, gold in kept])

    f1 = f1_score(true_labels, true_preds)
    report = classification_report(true_labels, true_preds)
    return {"f1": f1, "report": report}

import inspect

from transformers import DataCollatorForTokenClassification

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the install above is unpinned, so use whichever keyword the
# installed version accepts.
_eval_strategy_kw = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./ner-model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    **{_eval_strategy_kw: "epoch"},
)

# Same story for the Trainer: `tokenizer=` became `processing_class=`.
_tokenizer_kw = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # Sentences differ in length, so the per-token "labels" lists are ragged.
    # This collator pads them (with -100) together with the input ids; the
    # default padding collator does not pad "labels" and fails when it tries
    # to build the batch tensor.
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics,
    **{_tokenizer_kw: tokenizer},
)

# -------------------- 7. Train the model --------------------
trainer.train()

# -------------------- 8. Test prediction --------------------
def predict(sentence):
    """Print the predicted tag for each whitespace-separated word.

    The sentence is split on whitespace, tokenized as pre-split words and
    run through the module-level ``model``.  Each word is reported with the
    prediction made for its first word piece.

    Args:
        sentence: Raw input text.

    Returns:
        None.  Results are printed.
    """
    import torch  # local import: this cell does not import torch itself

    tokens = sentence.split()
    inputs = tokenizer(tokens, return_tensors="pt", is_split_into_words=True, truncation=True)
    # One entry per word piece: index of the source word, or None for
    # positions that belong to no word (special tokens).
    word_ids = inputs.word_ids(batch_index=0)

    # The model may have been moved to an accelerator during training; the
    # inputs have to live on the same device.
    inputs = inputs.to(model.device)
    model.eval()  # disable dropout for deterministic inference
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = logits.argmax(dim=2)[0].tolist()

    # A word can be split into several word pieces, so positions cannot be
    # matched to words by slicing; keep the first piece's prediction instead.
    word_predictions = {}
    for position, word_idx in enumerate(word_ids):
        if word_idx is not None:
            word_predictions.setdefault(word_idx, predictions[position])

    print("\nPredictions:")
    for word_idx, token in enumerate(tokens):
        # Words removed by truncation have no prediction and are skipped.
        if word_idx in word_predictions:
            print(f"{token}: {id_to_label[word_predictions[word_idx]]}")

# Example test: one sentence that mentions every slot type in label_list
# (to, from, date, count).  predict() prints its output and returns nothing.
predict("Book a flight to mumbai from kolkata on July 5 for 3 people")


In [None]:
from dateutil.parser import parse as parse_date
from datetime import datetime

def extract_slots_from_tokens(tokens, labels):
    """Collapse token-level B-/I- tags into a slot dictionary.

    Args:
        tokens: Words of the sentence.
        labels: One tag string per word (``"O"``, ``"B-<slot>"`` or
            ``"I-<slot>"``).

    Returns:
        A dict that always contains the keys ``"from"``, ``"to"``,
        ``"date"`` and ``"count"`` (empty string when the slot was not
        found).  Each value is the slot's words joined by single spaces.
        If a slot occurs more than once the last occurrence wins.  A
        non-empty ``"date"`` is reformatted as ``DD/MM/YYYY`` when it can be
        parsed; otherwise the raw text is kept.
    """
    slots = {
        "from": "",
        "to": "",
        "date": "",
        "count": ""
    }

    current_entity = None
    buffer = []

    def flush():
        """Store the entity collected so far (if any) and reset the state."""
        nonlocal current_entity, buffer
        if current_entity and buffer:
            slots[current_entity] = " ".join(buffer)
        current_entity = None
        buffer = []

    for token, label in zip(tokens, labels):
        if label.startswith("B-"):
            # A new entity begins: close the previous one first.
            flush()
            current_entity = label[2:].lower()
            buffer.append(token)
        elif label.startswith("I-") and current_entity and label[2:].lower() == current_entity:
            # Continuation of the entity that is currently open.
            buffer.append(token)
        else:
            # "O", an I- tag of a different type, or an I- tag with no open
            # entity: the token belongs to no slot and ends the open entity.
            flush()

    # Capture an entity that runs to the end of the sentence.
    flush()

    # Post-process date
    if slots["date"]:
        try:
            parsed_date = parse_date(slots["date"], fuzzy=True, dayfirst=True)
            slots["date"] = parsed_date.strftime("%d/%m/%Y")
        except (ValueError, OverflowError):
            # Best effort: an unparseable date keeps its raw text.
            pass

    return slots


In [None]:
# Updated predict with response parsing
def predict(sentence):
    """Tag a sentence, print the per-word tags and the extracted slots.

    The sentence is split on whitespace, tokenized as pre-split words and
    run through the module-level ``model``.  Each word is labelled with the
    prediction made for its first word piece, and the resulting word/label
    pairs are passed to ``extract_slots_from_tokens``.

    Args:
        sentence: Raw input text.

    Returns:
        None.  Results are printed.
    """
    import torch  # local import: this cell does not import torch itself

    tokens = sentence.split()
    inputs = tokenizer(tokens, return_tensors="pt", is_split_into_words=True, truncation=True)
    # One entry per word piece: index of the source word, or None for
    # positions that belong to no word (special tokens).
    word_ids = inputs.word_ids(batch_index=0)

    # The model may have been moved to an accelerator during training; the
    # inputs have to live on the same device.
    inputs = inputs.to(model.device)
    model.eval()  # disable dropout for deterministic inference
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = logits.argmax(dim=2)[0].tolist()

    # A word can be split into several word pieces, so positions cannot be
    # matched to words by slicing; keep the first piece's prediction instead.
    word_predictions = {}
    for position, word_idx in enumerate(word_ids):
        if word_idx is not None:
            word_predictions.setdefault(word_idx, predictions[position])

    # Words removed by truncation have no prediction; drop them so the word
    # and label lists stay aligned.
    words = []
    labels = []
    for word_idx, token in enumerate(tokens):
        if word_idx in word_predictions:
            words.append(token)
            labels.append(id_to_label[word_predictions[word_idx]])

    print("\nToken-level Predictions:")
    for token, label in zip(words, labels):
        print(f"{token}: {label}")

    structured_output = extract_slots_from_tokens(words, labels)
    print("\nExtracted Slots:")
    print(structured_output)
