In [None]:
# CELL 1 - Install dependencies and mount Drive (Colab)
!pip install -q transformers datasets evaluate seqeval accelerate

# Mount Google Drive (skip if not using Colab; change path if you want)
try:
    from google.colab import drive
    drive.mount('/content/drive')
    DRIVE_SAVE_PATH = "/content/drive/MyDrive/fin_ner_model"  # change if needed
except Exception as e:
    print("Not running in Colab or Drive mount failed. Saving locally.")
    DRIVE_SAVE_PATH = "./fin_ner_model"
print("Model will be saved to:", DRIVE_SAVE_PATH)


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model will be saved to: /content/drive/MyDrive/fin_ner_model


In [None]:
# CELL 2 - Configuration (edit if needed)
MODEL_NAME = "dslim/bert-base-NER"      # pretrained checkpoint
DATASET_NAME = "eriktks/conll2003"      # dataset to fine-tune on

# Quick-run toggles (for limited GPU)
QUICK_RUN = True   # set False to train on whole dataset (longer)
QUICK_TRAIN_SAMPLES = 2000   # number of training examples to use when QUICK_RUN=True
QUICK_VAL_SAMPLES = 500      # number of validation examples for QUICK_RUN

# Resource-aware hyperparameters (start conservative)
PER_DEVICE_BATCH_SIZE = 8    # small per-device batch size
GRADIENT_ACCUMULATION_STEPS = 2  # effective batch size = batch_size * grad_accum
MAX_LENGTH = 128             # reduce sequence length to save memory
NUM_EPOCHS = 1 if QUICK_RUN else 3
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01

OUTPUT_DIR = DRIVE_SAVE_PATH

# Reproducibility
SEED = 42


In [None]:
# REPLACEMENT CELL - download & parse CoNLL files from JohnSnowLabs mirror, build DatasetDict
# Run this instead of the previous failing download cell.

import os
import random
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import set_seed

# ensure SEED and QUICK_RUN variables exist (from CELL 2)
try:
    SEED
except NameError:
    SEED = 42
try:
    QUICK_RUN
except NameError:
    QUICK_RUN = True
try:
    QUICK_TRAIN_SAMPLES
    QUICK_VAL_SAMPLES
except NameError:
    QUICK_TRAIN_SAMPLES, QUICK_VAL_SAMPLES = 2000, 500

set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# JohnSnowLabs raw CoNLL files (reliable mirror)
BASE = "https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/"
FILES = {
    "train": "eng.train",
    "validation": "eng.testa",
    "test": "eng.testb",
}

os.makedirs("/tmp/conll2003_raw", exist_ok=True)

def download_file(url, out_path):
    # prefer wget if in Colab (works well for large files); fallback to requests
    try:
        # shell wget (Colab supports it)
        get_cmd = f"wget -q -O {out_path} {url}"
        rc = os.system(get_cmd)
        if rc != 0:
            raise RuntimeError("wget failed")
    except Exception:
        import requests
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        with open(out_path, "wb") as f:
            f.write(r.content)

def parse_conll(path):
    toks_all, tags_all = [], []
    with open(path, "r", encoding="utf-8") as f:
        toks, tags = [], []
        for line in f:
            line = line.strip()
            if not line:
                if toks:
                    toks_all.append(toks)
                    tags_all.append(tags)
                    toks, tags = [], []
                continue
            parts = line.split()
            # typical CoNLL format: WORD POS CHUNK NER
            if len(parts) < 4:
                continue
            toks.append(parts[0])
            tags.append(parts[-1])
        if toks:
            toks_all.append(toks)
            tags_all.append(tags)
    return toks_all, tags_all

split_dict = {}
for split, fname in FILES.items():
    url = BASE + fname
    local = os.path.join("/tmp/conll2003_raw", fname)
    if not os.path.exists(local):
        print(f"Downloading {split} from {url} ...")
        download_file(url, local)
    else:
        print("Using cached:", local)
    toks, tags = parse_conll(local)
    split_dict[split] = {"tokens": toks, "ner_tags": tags}
    print(f"Parsed {split}: {len(toks)} sentences")

# canonical CoNLL-2003 label list
label_list = [
    "O",
    "B-PER", "I-PER",
    "B-ORG", "I-ORG",
    "B-LOC", "I-LOC",
    "B-MISC", "I-MISC"
]
label_to_id = {l: i for i, l in enumerate(label_list)}
num_labels = len(label_list)
print("Label list:", label_list)

# convert string tags to ids (fall back to 'O' if unknown)
for split in split_dict:
    split_dict[split]["ner_tags"] = [
        [label_to_id.get(tag, label_to_id["O"]) for tag in sent]
        for sent in split_dict[split]["ner_tags"]
    ]

# Build HF DatasetDict
hf_splits = {}
for split in split_dict:
    hf_splits[split] = Dataset.from_dict({
        "tokens": split_dict[split]["tokens"],
        "ner_tags": split_dict[split]["ner_tags"],
    })
datasets = DatasetDict(hf_splits)

print("Built DatasetDict with splits:", list(datasets.keys()))
print("Train/example tokens[0]:", datasets["train"][0]["tokens"][:20])
print("Train/example ner_tags[0]:", datasets["train"][0]["ner_tags"][:20])

# QUICK_RUN subsampling if requested
if QUICK_RUN:
    print("QUICK_RUN enabled — subsampling for faster training")
    datasets["train"] = datasets["train"].shuffle(seed=SEED).select(range(min(len(datasets["train"]), QUICK_TRAIN_SAMPLES)))
    datasets["validation"] = datasets["validation"].shuffle(seed=SEED).select(range(min(len(datasets["validation"]), QUICK_VAL_SAMPLES)))
    datasets["test"] = datasets["test"].shuffle(seed=SEED).select(range(min(len(datasets["test"]), QUICK_VAL_SAMPLES)))
print("Final sizes -- train:", len(datasets["train"]), "validation:", len(datasets["validation"]), "test:", len(datasets["test"]))


Downloading train from https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.train ...
Parsed train: 14987 sentences
Downloading validation from https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.testa ...
Parsed validation: 3466 sentences
Downloading test from https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.testb ...
Parsed test: 3684 sentences
Label list: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
Built DatasetDict with splits: ['train', 'validation', 'test']
Train/example tokens[0]: ['-DOCSTART-']
Train/example ner_tags[0]: [0]
QUICK_RUN enabled — subsampling for faster training
Final sizes -- train: 2000 validation: 500 test: 500


In [None]:
# CELL 5 - Load tokenizer & model, prepare tokenization + label alignment
from transformers import AutoTokenizer, AutoModelForTokenClassification
import math

# Use fast tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Load model and resize classification head if needed
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

print("Tokenizer and model loaded. Num labels:", num_labels)

# Tokenize + align labels function (handles word pieces)
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=MAX_LENGTH,
        padding=False,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # label for first token of the word
                # if example has shorter label list (rare), guard
                if word_idx < len(label):
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(label_list.index("O"))
            else:
                # sub-token
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Map tokenization (batched); keep original columns removed to avoid Trainer warnings
cols_to_remove = [c for c in datasets["train"].column_names]
tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=cols_to_remove,
)

print("Tokenization complete. Example tokenized keys:", tokenized_datasets["train"].column_names)
print("Sample labels length:", len(tokenized_datasets["train"][0]["labels"]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Tokenizer and model loaded. Num labels: 9


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenization complete. Example tokenized keys: ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
Sample labels length: 26


In [None]:
# CELL 6 - Data collator and compute_metrics using seqeval
from transformers import DataCollatorForTokenClassification
import numpy as np
import evaluate

data_collator = DataCollatorForTokenClassification(tokenizer)
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=-1)

    true_predictions = []
    true_labels = []

    for pred_seq, label_seq in zip(preds, labels):
        pred_labels = []
        true_label_names = []
        for p_id, l_id in zip(pred_seq, label_seq):
            if l_id == -100:
                continue
            pred_labels.append(label_list[p_id])
            true_label_names.append(label_list[l_id])
        true_predictions.append(pred_labels)
        true_labels.append(true_label_names)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results.get("overall_precision", 0.0),
        "recall": results.get("overall_recall", 0.0),
        "f1": results.get("overall_f1", 0.0),
        "accuracy": results.get("overall_accuracy", 0.0),
    }

print("Data collator and metrics ready.")


Downloading builder script: 0.00B [00:00, ?B/s]

Data collator and metrics ready.


In [None]:
!ls /content/drive/MyDrive


'Colab Notebooks'   fin_ner_model   ner_model_final


In [None]:
OUTPUT_DIR = "/content/drive/MyDrive/fin_ner_model"


In [None]:
import os
assert os.path.isdir(OUTPUT_DIR), f"Model folder not found at {OUTPUT_DIR}"
print("✅ Found model folder:", OUTPUT_DIR)


✅ Found model folder: /content/drive/MyDrive/fin_ner_model


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

loaded_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR, use_fast=True)
loaded_model = AutoModelForTokenClassification.from_pretrained(OUTPUT_DIR)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)
print("✅ Model loaded and ready on:", device)


✅ Model loaded and ready on: cuda


In [None]:
def predict_tokens(sentence_tokens, debug=False):
    tokenized = loaded_tokenizer(
        sentence_tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        padding=True
    ).to(device)

    with torch.no_grad():
        outputs = loaded_model(**tokenized)
        predictions = torch.argmax(outputs.logits, dim=2)

    pred_ids = predictions[0].tolist()
    word_ids = tokenized.word_ids(batch_index=0)
    label_map = loaded_model.config.id2label

    results = []
    current_word = None
    for token_id, word_id in zip(pred_ids, word_ids):
        if word_id is None or word_id == current_word:
            continue
        results.append((sentence_tokens[word_id], label_map[token_id]))
        current_word = word_id

    if debug:
        print("Tokens:", loaded_tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0]))
        print("Word IDs:", word_ids)
        print("Pred IDs:", pred_ids)
        print("Results:", results)

    return results

# Example test
example = ["Barclays", "Bank", "plc", "is", "headquartered", "in", "London", "."]
print("Prediction example:", predict_tokens(example))


Prediction example: [('Barclays', 'B-PER'), ('Bank', 'I-PER'), ('plc', 'I-PER'), ('is', 'O'), ('headquartered', 'O'), ('in', 'O'), ('London', 'B-ORG'), ('.', 'O')]


In [None]:
correct_id2label = {
    0: 'O',
    1: 'B-MISC',
    2: 'I-MISC',
    3: 'B-PER',
    4: 'I-PER',
    5: 'B-ORG',
    6: 'I-ORG',
    7: 'B-LOC',
    8: 'I-LOC'
}
correct_label2id = {v: k for k, v in correct_id2label.items()}

# Update model config
loaded_model.config.id2label = correct_id2label
loaded_model.config.label2id = correct_label2id


In [None]:
example = ["Barclays", "Bank", "plc", "is", "headquartered", "in", "London", "."]
print("Prediction example:", predict_tokens(example))


Prediction example: [('Barclays', 'B-PER'), ('Bank', 'I-PER'), ('plc', 'I-PER'), ('is', 'O'), ('headquartered', 'O'), ('in', 'O'), ('London', 'B-ORG'), ('.', 'O')]


In [None]:
correct_id2label = {
    0: 'O',
    1: 'B-MISC',
    2: 'I-MISC',
    3: 'B-PER',
    4: 'I-PER',
    5: 'B-ORG',
    6: 'I-ORG',
    7: 'B-LOC',
    8: 'I-LOC'
}
correct_label2id = {v: k for k, v in correct_id2label.items()}

loaded_model.config.id2label = correct_id2label
loaded_model.config.label2id = correct_label2id

# ✅ Verify
print("Model label mapping after fix:")
print(loaded_model.config.id2label)


Model label mapping after fix:
{0: 'O', 1: 'B-MISC', 2: 'I-MISC', 3: 'B-PER', 4: 'I-PER', 5: 'B-ORG', 6: 'I-ORG', 7: 'B-LOC', 8: 'I-LOC'}


In [None]:
example = ["Barclays", "Bank", "plc", "is", "headquartered", "in", "London", "."]
print("Prediction example:", predict_tokens(example, debug=True))


Tokens: ['[CLS]', 'Barclay', '##s', 'Bank', 'plc', 'is', 'headquartered', 'in', 'London', '.', '[SEP]']
Word IDs: [None, 0, 0, 1, 2, 3, 4, 5, 6, 7, None]
Pred IDs: [0, 3, 4, 4, 4, 0, 0, 0, 5, 0, 0]
Results: [('Barclays', 'B-PER'), ('Bank', 'I-PER'), ('plc', 'I-PER'), ('is', 'O'), ('headquartered', 'O'), ('in', 'O'), ('London', 'B-ORG'), ('.', 'O')]
Prediction example: [('Barclays', 'B-PER'), ('Bank', 'I-PER'), ('plc', 'I-PER'), ('is', 'O'), ('headquartered', 'O'), ('in', 'O'), ('London', 'B-ORG'), ('.', 'O')]


In [None]:
import torch.nn as nn

classifier_layer = loaded_model.classifier
num_labels = classifier_layer.out_features

print(f"🧠 Model classifier output size: {num_labels}")
print("🔢 Label indices from model head weights:", list(range(num_labels)))
print("🧩 Current id2label mapping:", loaded_model.config.id2label)


🧠 Model classifier output size: 9
🔢 Label indices from model head weights: [0, 1, 2, 3, 4, 5, 6, 7, 8]
🧩 Current id2label mapping: {0: 'O', 1: 'B-MISC', 2: 'I-MISC', 3: 'B-PER', 4: 'I-PER', 5: 'B-ORG', 6: 'I-ORG', 7: 'B-LOC', 8: 'I-LOC'}


In [None]:
!ls /content/drive/MyDrive/fin_ner_model


checkpoint-125	   special_tokens_map.json  trainer_state.json
config.json	   test_metrics.json	    training_args.bin
model.safetensors  tokenizer_config.json    vocab.txt
runs		   tokenizer.json


In [None]:
import json

config_path = "/content/drive/MyDrive/fin_ner_model/config.json"

with open(config_path) as f:
    cfg = json.load(f)

print("🔍 Extracted label mapping from config file:\n")
print(json.dumps(cfg.get("id2label", {}), indent=2))


🔍 Extracted label mapping from config file:

{
  "0": "O",
  "1": "B-MISC",
  "2": "I-MISC",
  "3": "B-PER",
  "4": "I-PER",
  "5": "B-ORG",
  "6": "I-ORG",
  "7": "B-LOC",
  "8": "I-LOC"
}


In [None]:
import torch

# Get a dummy example to extract logits
example = ["Barclays", "Bank", "plc", "is", "headquartered", "in", "London", "."]
tokens = loaded_tokenizer(example, is_split_into_words=True, return_tensors="pt").to(device)
outputs = loaded_model(**tokens)

print("Logit shape:", outputs.logits.shape)  # [1, seq_len, num_labels]
print("Number of labels (model head):", outputs.logits.shape[-1])


Logit shape: torch.Size([1, 11, 9])
Number of labels (model head): 9


In [None]:
def raw_predict(sentence_tokens):
    tokenized = loaded_tokenizer(sentence_tokens, is_split_into_words=True, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = loaded_model(**tokenized).logits
    pred_ids = torch.argmax(logits, dim=-1)[0].cpu().tolist()
    return pred_ids, tokenized

pred_ids, tokenized = raw_predict(example)

print("Tokens:", loaded_tokenizer.convert_ids_to_tokens(tokenized['input_ids'][0]))
print("Pred IDs:", pred_ids)


Tokens: ['[CLS]', 'Barclay', '##s', 'Bank', 'plc', 'is', 'headquartered', 'in', 'London', '.', '[SEP]']
Pred IDs: [0, 3, 4, 4, 4, 0, 0, 0, 5, 0, 0]


In [None]:
# Correct the internal label ID remapping
remap_labels = {
    3: 5,  # B-PER → B-ORG
    4: 6,  # I-PER → I-ORG
    5: 7,  # B-ORG → B-LOC
    # all other labels remain the same
}


In [None]:
def predict_tokens(sentence_tokens, debug=False):
    # Tokenize input
    tokenized = loaded_tokenizer(
        sentence_tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        padding=True
    ).to(device)

    # Run model
    with torch.no_grad():
        outputs = loaded_model(**tokenized)
        predictions = torch.argmax(outputs.logits, dim=2)

    pred_ids = predictions[0].tolist()
    word_ids = tokenized.word_ids(batch_index=0)
    label_map = loaded_model.config.id2label

    # ✅ Label ID remap correction
    remap_labels = {
        3: 5,  # model's B-PER → actual B-ORG
        4: 6,  # model's I-PER → actual I-ORG
        5: 7,  # model's B-ORG → actual B-LOC
    }

    results = []
    current_word = None
    for token_id, word_id in zip(pred_ids, word_ids):
        if word_id is None or word_id == current_word:
            continue
        corrected_id = remap_labels.get(token_id, token_id)
        results.append((sentence_tokens[word_id], label_map[corrected_id]))
        current_word = word_id

    if debug:
        print("Tokens:", loaded_tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0]))
        print("Word IDs:", word_ids)
        print("Pred IDs:", pred_ids)
        print("Results:", results)

    return results


In [None]:
example = ["Barclays", "Bank", "plc", "is", "headquartered", "in", "London", "."]
print("Fixed prediction:", predict_tokens(example))


Fixed prediction: [('Barclays', 'B-ORG'), ('Bank', 'I-ORG'), ('plc', 'I-ORG'), ('is', 'O'), ('headquartered', 'O'), ('in', 'O'), ('London', 'B-LOC'), ('.', 'O')]


In [None]:
# ✅ CELL 7 - TrainingArguments (resource-aware)
from transformers import TrainingArguments
import os

# Ensure OUTPUT_DIR exists
try:
    OUTPUT_DIR
except NameError:
    try:
        OUTPUT_DIR = DRIVE_SAVE_PATH
    except NameError:
        OUTPUT_DIR = "./fin_ner_model"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Set defaults if missing
NUM_EPOCHS = globals().get("NUM_EPOCHS", 5)
PER_DEVICE_BATCH_SIZE = globals().get("PER_DEVICE_BATCH_SIZE", 8)
GRADIENT_ACCUMULATION_STEPS = globals().get("GRADIENT_ACCUMULATION_STEPS", 2)
LEARNING_RATE = globals().get("LEARNING_RATE", 3e-5)
SEED = globals().get("SEED", 42)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    eval_strategy="steps",  # corrected key
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    logging_steps=50,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),  # optional but recommended
    fp16=True,  # use mixed precision if GPU is available
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    dataloader_num_workers=2,
    seed=SEED,
)

print("✅ Training args prepared.")
print("Output directory:", OUTPUT_DIR)

✅ Training args prepared.
Output directory: /content/drive/MyDrive/fin_ner_model


In [None]:
# CELL 8 - Create Trainer, move model to device
from transformers import Trainer
import torch

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
model.to(device)


  trainer = Trainer(


Device: cuda


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
# CELL 9 - Train
print("Starting training. This will be quick because QUICK_RUN is enabled and dataset was subsampled.")
train_result = trainer.train()
trainer.save_state()

# Save model and tokenizer
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Training finished. Model and tokenizer saved to:", OUTPUT_DIR)
print("Train metrics:", train_result.metrics)


Starting training. This will be quick because QUICK_RUN is enabled and dataset was subsampled.


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33minfosysshreya3[0m ([33minfosysshreya3-government-college-of-engineering[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


Training finished. Model and tokenizer saved to: /content/drive/MyDrive/fin_ner_model
Train metrics: {'train_runtime': 95.9402, 'train_samples_per_second': 20.846, 'train_steps_per_second': 1.303, 'total_flos': 42871712184000.0, 'train_loss': 0.10119731247425079, 'epoch': 1.0}


In [None]:
# CELL 10 - Evaluate on test set
print("Evaluating on test set...")
test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("Test metrics:", test_metrics)


Evaluating on test set...


Test metrics: {'eval_loss': 0.12040021270513535, 'eval_precision': 0.8898963730569949, 'eval_recall': 0.89453125, 'eval_f1': 0.8922077922077922, 'eval_accuracy': 0.9794740218088518, 'eval_runtime': 1.3683, 'eval_samples_per_second': 365.409, 'eval_steps_per_second': 46.042, 'epoch': 1.0}


In [None]:
# --- Combined cell: evaluate + print predictions in tuple style (fixed for JSON) ---
import numpy as np
import json
import os
import evaluate

seqeval = evaluate.load("seqeval")

def preds_labels_from_logits(pred_logits, label_ids_batch, label_list):
    preds = np.argmax(pred_logits, axis=-1)
    all_true, all_pred = [], []
    for pred_seq, label_seq in zip(preds, label_ids_batch):
        pred_labels, true_labels = [], []
        for p_id, l_id in zip(pred_seq, label_seq):
            if l_id == -100:
                continue
            pred_labels.append(label_list[p_id])
            true_labels.append(label_list[l_id])
        all_pred.append(pred_labels)
        all_true.append(true_labels)
    return all_pred, all_true

def convert_to_python_types(obj):
    """Recursively convert numpy types to native Python types for JSON."""
    if isinstance(obj, dict):
        return {k: convert_to_python_types(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_python_types(v) for v in obj]
    elif isinstance(obj, (np.integer, np.int64)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64)):
        return float(obj)
    else:
        return obj

def evaluate_and_print(dataset, name="test", N_print=5):
    print(f"\n=== Evaluating {name} set ===")
    pred_output = trainer.predict(dataset)
    logits = pred_output.predictions
    label_ids = pred_output.label_ids

    pred_label_seqs, true_label_seqs = preds_labels_from_logits(logits, label_ids, label_list)

    # Overall metrics
    results = seqeval.compute(predictions=pred_label_seqs, references=true_label_seqs)
    results_safe = convert_to_python_types(results)  # convert for JSON

    print(f"\nOverall metrics ({name}):")
    print(json.dumps({
        "precision": results_safe.get("overall_precision"),
        "recall": results_safe.get("overall_recall"),
        "f1": results_safe.get("overall_f1"),
        "accuracy": results_safe.get("overall_accuracy")
    }, indent=2))

    # Save metrics
    metrics_file = os.path.join(OUTPUT_DIR if 'OUTPUT_DIR' in globals() else "./", f"{name}_metrics.json")
    with open(metrics_file, "w") as f:
        json.dump(results_safe, f, indent=2)
    print(f"Saved metrics to {metrics_file}")

    # Print first N examples in clean tuple format
    print(f"\n=== Sample predictions ({N_print} examples) ===")
    for i in range(min(N_print, len(dataset))):
        words = datasets[name][i]["tokens"]
        preds = pred_label_seqs[i]
        tuples = [(w, l) for w, l in zip(words, preds)]
        print("[")
        for word, label in tuples:
            print(f" ('{word}', '{label}'),")
        print("]\n")
    return results_safe, pred_label_seqs, true_label_seqs

# Run evaluation + print predictions
test_results, test_preds, test_refs = evaluate_and_print(tokenized_datasets["test"], name="test", N_print=5)



=== Evaluating test set ===



Overall metrics (test):
{
  "precision": 0.8898963730569949,
  "recall": 0.89453125,
  "f1": 0.8922077922077922,
  "accuracy": 0.9794740218088518
}
Saved metrics to /content/drive/MyDrive/fin_ner_model/test_metrics.json

=== Sample predictions (5 examples) ===
[
 ('25-1', 'O'),
 ('Barcelona', 'B-ORG'),
 ('Real', 'I-ORG'),
 ('Madrid', 'I-ORG'),
]

[
 ('W', 'O'),
 ('L', 'O'),
 ('T', 'O'),
 ('GF', 'O'),
 ('GA', 'O'),
 ('PTS', 'O'),
]

[
 ('ST', 'B-ORG'),
 ('LOUIS', 'I-ORG'),
 ('AT', 'O'),
 ('COLORADO', 'B-LOC'),
]

[
 ('after', 'O'),
 ('Saturday', 'O'),
 (''s', 'O'),
 ('matches', 'O'),
 ('(', 'O'),
 ('tabulated', 'O'),
 ('-', 'O'),
 ('played', 'O'),
 (',', 'O'),
 ('won', 'O'),
 (',', 'O'),
 ('drawn', 'O'),
 (',', 'O'),
 ('lost', 'O'),
 (',', 'O'),
]

[
 ('Italy', 'B-LOC'),
 ('commission', 'O'),
 ('concludes', 'O'),
 ('1997', 'O'),
 ('budget', 'O'),
 ('examination', 'O'),
 ('.', 'O'),
]



In [None]:
# CELL 11 - Quick inference example using saved model
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

loaded_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR, use_fast=True)
loaded_model = AutoModelForTokenClassification.from_pretrained(OUTPUT_DIR)
loaded_model.to(device)
loaded_model.eval()

def predict_tokens(sentence_tokens):
    # sentence_tokens: list of words, e.g. ["Apple", "is", "in", "Cupertino", "."]
    inputs = loaded_tokenizer(sentence_tokens, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = loaded_model(**inputs)
    logits = outputs.logits.cpu().numpy()[0]
    pred_ids = logits.argmax(-1)
    word_ids = inputs["input_ids"].cpu().numpy()[0]  # not used directly
    # Use tokenizer.word_ids to map tokens -> words
    word_id_map = loaded_tokenizer(sentence_tokens, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=MAX_LENGTH).word_ids(batch_index=0)
    results = []
    prev_word_idx = None
    for idx, word_idx in enumerate(word_id_map):
        if word_idx is None:
            continue
        if word_idx != prev_word_idx:
            results.append((sentence_tokens[word_idx], label_list[pred_ids[idx]]))
        prev_word_idx = word_idx
    return results

# Example
example = ["Barclays", "Bank", "plc", "is", "headquartered", "in", "London", "."]
print("Prediction example:", predict_tokens(example))


Prediction example: [('Barclays', 'B-ORG'), ('Bank', 'I-ORG'), ('plc', 'I-ORG'), ('is', 'O'), ('headquartered', 'O'), ('in', 'O'), ('London', 'B-LOC'), ('.', 'O')]


In [None]:
tests = [
    ["Apple", "Inc.", "announced", "a", "revenue", "of", "$89", "billion", "."],
    ["John", "Doe", "met", "Jane", "Smith", "in", "New", "York", "."],
    ["Tesla", "Motors", "was", "founded", "by", "Elon", "Musk", "."],
]

for t in tests:
    print("\n", t)
    print(predict_tokens(t))



 ['Apple', 'Inc.', 'announced', 'a', 'revenue', 'of', '$89', 'billion', '.']
[('Apple', 'B-ORG'), ('Inc.', 'I-ORG'), ('announced', 'O'), ('a', 'O'), ('revenue', 'O'), ('of', 'O'), ('$89', 'O'), ('billion', 'O'), ('.', 'O')]

 ['John', 'Doe', 'met', 'Jane', 'Smith', 'in', 'New', 'York', '.']
[('John', 'B-PER'), ('Doe', 'I-PER'), ('met', 'O'), ('Jane', 'B-PER'), ('Smith', 'I-PER'), ('in', 'O'), ('New', 'B-LOC'), ('York', 'I-LOC'), ('.', 'O')]

 ['Tesla', 'Motors', 'was', 'founded', 'by', 'Elon', 'Musk', '.']
[('Tesla', 'B-ORG'), ('Motors', 'I-ORG'), ('was', 'O'), ('founded', 'O'), ('by', 'O'), ('Elon', 'B-ORG'), ('Musk', 'I-PER'), ('.', 'O')]


In [None]:
def predict_tokens(sentence_tokens, debug=False):
    # sentence_tokens: list of words, e.g. ["Barclays", "Bank", "plc", "is", "headquartered", "in", "London", "."]
    # 1) Tokenize once and keep the tokenized object
    tokenized = loaded_tokenizer(
        sentence_tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH
    )

    # 2) Get word_ids from the same tokenization (includes None for special tokens)
    word_id_map = tokenized.word_ids(batch_index=0)

    # 3) Move tensors to device and run model
    inputs = {k: v.to(device) for k, v in tokenized.items() if k in ["input_ids", "attention_mask", "token_type_ids"] or k.startswith("offset_")}
    with torch.no_grad():
        outputs = loaded_model(**inputs)
    logits = outputs.logits.cpu().numpy()[0]   # shape: (seq_len, num_labels)
    pred_ids = logits.argmax(axis=-1)          # length == seq_len

    # 4) Map first subtoken's prediction to the original word
    results = []
    prev_word_idx = None
    for token_idx, word_idx in enumerate(word_id_map):
        if word_idx is None:
            prev_word_idx = None
            continue
        if word_idx != prev_word_idx:
            # prediction for the first subtoken of this word
            label_id = int(pred_ids[token_idx])
            # use model.config.id2label to avoid mismatched label ordering
            label_name = loaded_model.config.id2label[label_id] if hasattr(loaded_model.config, "id2label") else label_list[label_id]
            results.append((sentence_tokens[word_idx], label_name))
        prev_word_idx = word_idx

    if debug:
        print("Tokens:", loaded_tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0]))
        print("Word ids:", word_id_map)
        print("Pred ids:", pred_ids.tolist())
        print("Results:", results)

    return results


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import os

# ✅ Correct path to your model directory on Drive
OUTPUT_DIR = "/content/drive/MyDrive/fin_ner_model"

# Check if path exists
assert os.path.isdir(OUTPUT_DIR), f"Model folder not found at {OUTPUT_DIR}"

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ✅ Load tokenizer and model explicitly from local directory
loaded_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR, local_files_only=True)
loaded_model = AutoModelForTokenClassification.from_pretrained(OUTPUT_DIR, local_files_only=True)
loaded_model.to(device)
loaded_model.eval()

# ✅ Define prediction function
def predict_tokens(sentence_tokens, debug=False):
    """
    Predict NER tags for a list of words, handling subword alignment correctly.
    """
    tokenized = loaded_tokenizer(
        sentence_tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=128
    )

    word_id_map = tokenized.word_ids(batch_index=0)
    inputs = {k: v.to(device) for k, v in tokenized.items()}

    with torch.no_grad():
        outputs = loaded_model(**inputs)
    logits = outputs.logits.cpu().numpy()[0]
    pred_ids = logits.argmax(axis=-1)

    results = []
    prev_word_idx = None
    for token_idx, word_idx in enumerate(word_id_map):
        if word_idx is None:
            continue
        if word_idx != prev_word_idx:
            label_id = int(pred_ids[token_idx])
            label_name = loaded_model.config.id2label[label_id]
            results.append((sentence_tokens[word_idx], label_name))
        prev_word_idx = word_idx

    if debug:
        print("Tokens:", loaded_tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0]))
        print("Word IDs:", word_id_map)
        print("Pred IDs:", pred_ids.tolist())
        print("Results:", results)

    return results

# ✅ Example usage
example = ["Barclays", "Bank", "plc", "is", "headquartered", "in", "London", "."]
print("Prediction example:", predict_tokens(example))


Using device: cuda
Prediction example: [('Barclays', 'B-PER'), ('Bank', 'I-PER'), ('plc', 'I-PER'), ('is', 'O'), ('headquartered', 'O'), ('in', 'O'), ('London', 'B-ORG'), ('.', 'O')]


In [None]:
print(loaded_model.config.id2label)
print(loaded_model.config.label2id)


{0: 'O', 1: 'B-MISC', 2: 'I-MISC', 3: 'B-PER', 4: 'I-PER', 5: 'B-ORG', 6: 'I-ORG', 7: 'B-LOC', 8: 'I-LOC'}
{'B-LOC': 7, 'B-MISC': 1, 'B-ORG': 5, 'B-PER': 3, 'I-LOC': 8, 'I-MISC': 2, 'I-ORG': 6, 'I-PER': 4, 'O': 0}


In [None]:
example = ["Barclays", "Bank", "plc", "is", "headquartered", "in", "London", "."]
predict_tokens(example, debug=True)


Tokens: ['[CLS]', 'Barclay', '##s', 'Bank', 'plc', 'is', 'headquartered', 'in', 'London', '.', '[SEP]']
Word IDs: [None, 0, 0, 1, 2, 3, 4, 5, 6, 7, None]
Pred IDs: [0, 3, 4, 4, 4, 0, 0, 0, 5, 0, 0]
Results: [('Barclays', 'B-PER'), ('Bank', 'I-PER'), ('plc', 'I-PER'), ('is', 'O'), ('headquartered', 'O'), ('in', 'O'), ('London', 'B-ORG'), ('.', 'O')]


[('Barclays', 'B-PER'),
 ('Bank', 'I-PER'),
 ('plc', 'I-PER'),
 ('is', 'O'),
 ('headquartered', 'O'),
 ('in', 'O'),
 ('London', 'B-ORG'),
 ('.', 'O')]

In [None]:
from datasets import load_dataset

dataset = load_dataset("Josephgflowers/Financial-NER-NLP")

# Check the first example
print(dataset['train'][0])


README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/367M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/900384 [00:00<?, ? examples/s]

{'system': 'Extract the named entities in this text using 139 XBRL tags in the IOB2 format. Return the results in JSON format.', 'user': 'ITEM 1 Financial Statements Lennar Corporation and Subsidiaries Condensed Consolidated Balance Sheets ( Dollars in thousands , except shares and per share amounts ) ( unaudited ) ( 1 ) Under certain provisions of Accounting Standards Codification ( “ ASC ” ) Topic 810 , Consolidations , ( “ ASC 810 ” ) the Company is required to separately disclose on its condensed consolidated balance sheets the assets owned by consolidated variable interest entities ( “ VIEs ” ) and liabilities of consolidated VIEs as to which neither Lennar Corporation , or any of its subsidiaries , has any obligations . As of May 31 , 2016 , total assets include $ 645.1 million related to consolidated VIEs of which $ 8.2 million is included in Lennar Homebuilding cash and cash equivalents , $ 0.1 million in Lennar Homebuilding receivables , net , $ 6.2 million in Lennar Homebuild

In [None]:
print(dataset)
print(dataset['train'][0])


DatasetDict({
    train: Dataset({
        features: ['system', 'user', 'assistant'],
        num_rows: 900384
    })
})
{'system': 'Extract the named entities in this text using 139 XBRL tags in the IOB2 format. Return the results in JSON format.', 'user': 'ITEM 1 Financial Statements Lennar Corporation and Subsidiaries Condensed Consolidated Balance Sheets ( Dollars in thousands , except shares and per share amounts ) ( unaudited ) ( 1 ) Under certain provisions of Accounting Standards Codification ( “ ASC ” ) Topic 810 , Consolidations , ( “ ASC 810 ” ) the Company is required to separately disclose on its condensed consolidated balance sheets the assets owned by consolidated variable interest entities ( “ VIEs ” ) and liabilities of consolidated VIEs as to which neither Lennar Corporation , or any of its subsidiaries , has any obligations . As of May 31 , 2016 , total assets include $ 645.1 million related to consolidated VIEs of which $ 8.2 million is included in Lennar Homebuildi

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "t5-small"  # or any seq2seq model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Example input
prompt = "Extract the named entities in this text using 139 XBRL tags in the IOB2 format. Return the results in JSON format.\n\nITEM 1 Financial Statements Lennar Corporation..."
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
outputs = model.generate(**inputs, max_length=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

format. Return the results in JSON format. ITEM 1 Financial Statements Lennar Corporation...


In [None]:
def extract_financial_entities(text):
    words = text.split()
    preds = predict_tokens(words)

    entities = []
    current_entity = {"type": None, "tokens": []}

    for word, label in preds:
        if label.startswith("B-"):
            # Save previous entity
            if current_entity["type"]:
                entities.append({
                    "entity": current_entity["type"],
                    "text": " ".join(current_entity["tokens"])
                })
            current_entity = {"type": label[2:], "tokens": [word]}
        elif label.startswith("I-") and current_entity["type"] == label[2:]:
            current_entity["tokens"].append(word)
        else:
            if current_entity["type"]:
                entities.append({
                    "entity": current_entity["type"],
                    "text": " ".join(current_entity["tokens"])
                })
            current_entity = {"type": None, "tokens": []}

    # Append last entity
    if current_entity["type"]:
        entities.append({
            "entity": current_entity["type"],
            "text": " ".join(current_entity["tokens"])
        })

    return entities


In [None]:
import re

def regex_enhancement(text):
    patterns = {
        "revenue": r"revenue[s]?\s+(?:was|of|totaled)?\s*\$[\d,.]+\s*(million|billion)?",
        "net_income": r"net income\s+(?:was|of)?\s*\$[\d,.]+\s*(million|billion)?",
        "eps": r"earnings per share\s+(?:was|of)?\s*\$[\d.]+"
    }
    found = []
    for k, p in patterns.items():
        matches = re.findall(p, text, re.I)
        for m in matches:
            found.append({"entity": k.upper(), "text": m})
    return found


In [None]:
def smart_extract(text):
    model_entities = extract_financial_entities(text)
    regex_entities = regex_enhancement(text)

    all_entities = model_entities + regex_entities
    seen = set()
    final = []
    for e in all_entities:
        if e["text"].lower() not in seen:
            final.append(e)
            seen.add(e["text"].lower())
    return final


In [None]:
text = "Netflix reported revenue of $4.3 billion for the quarter ended March 31, 2025."
results = smart_extract(text)
for r in results:
    print(f"{r['entity']:>10} | {r['text']}")


       PER | Netflix
   REVENUE | billion


In [None]:
def fix_labels(results):
    corrected = []
    for word, label in results:
        if label == "B-PER" and word.lower() in ["netflix", "tesla", "google", "amazon", "apple"]:
            label = "B-ORG"
        corrected.append((word, label))
    return corrected


In [None]:
results = predict_tokens(example)
results = fix_labels(results)
print(results)


[('Barclays', 'B-PER'), ('Bank', 'I-PER'), ('plc', 'I-PER'), ('is', 'O'), ('headquartered', 'O'), ('in', 'O'), ('London', 'B-ORG'), ('.', 'O')]


In [None]:
from datasets import Dataset

financial_data = [
    {
        "tokens": ["Netflix", "reported", "revenue", "of", "$4.3", "billion", "for", "the", "quarter", "ended", "March", "31", ",", "2025", "."],
        "ner_tags": ["B-ORG", "O", "B-METRIC", "O", "B-VALUE", "I-VALUE", "O", "O", "O", "O", "B-DATE", "I-DATE", "O", "I-DATE", "O"]
    },
    {
        "tokens": ["Net", "income", "was", "$558", "million", "as", "compared", "to", "$402", "million", "last", "year", "."],
        "ner_tags": ["B-METRIC", "I-METRIC", "O", "B-VALUE", "I-VALUE", "O", "O", "O", "B-VALUE", "I-VALUE", "B-DATE", "I-DATE", "O"]
    },
    {
        "tokens": ["Earnings", "per", "share", "was", "$12.5", "for", "the", "three", "months", "ended", "December", "31", ",", "2024", "."],
        "ner_tags": ["B-METRIC", "I-METRIC", "I-METRIC", "O", "B-VALUE", "O", "O", "O", "O", "O", "B-DATE", "I-DATE", "O", "I-DATE", "O"]
    }
]

dataset = Dataset.from_dict({
    "tokens": [d["tokens"] for d in financial_data],
    "ner_tags": [d["ner_tags"] for d in financial_data]
})
train_test = dataset.train_test_split(test_size=0.3)
train_ds, test_ds = train_test["train"], train_test["test"]


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "/content/drive/MyDrive/fin_ner_model"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)


In [None]:
# Extend label set with financial tags
new_labels = ["O","B-ORG","I-ORG","B-METRIC","I-METRIC","B-VALUE","I-VALUE","B-DATE","I-DATE"]
label2id = {label:i for i,label in enumerate(new_labels)}
id2label = {i:label for label,i in label2id.items()}
model.config.id2label = id2label
model.config.label2id = label2id


In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

from datasets import DatasetDict
tokenized = DatasetDict({
    "train": train_ds.map(tokenize_and_align_labels, batched=True),
    "test": test_ds.map(tokenize_and_align_labels, batched=True)
})


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
from PyPDF2 import PdfReader

pdf_path = "/content/Nflx-20250331.pdf"  # path in your Drive
reader = PdfReader(pdf_path)

text_data = ""
for page in reader.pages:
    text_data += page.extract_text() + "\n"

# Clean up excessive whitespace
import re
text_data = re.sub(r'\s+', ' ', text_data)
print(text_data[:1000])  # preview first 1000 chars


UNITED ST ATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-Q (Mark One) ☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the quarterly period ended March 31, 2025 OR ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the transition period from to Commission File Number: 001-35727 Netflix, Inc. (Exact name of Registrant as specified in its charter) Delaware 77-0467272 (State or other jurisdiction of incorporation or organization)(I.R.S. Employer Identification Number) 121 Albright Way,Los Gatos,California 95032 (Address of principal executive offices) (Zip Code) (408) 540-3700 (Registrant’s telephone number, including area code) Securities registered pursuant to Section 12(b) of the Act: Title of each class Trading Symbol(s) Name of each exchange on which registered Common stock, par value $0.001 per share NFLX NASDAQ Global Select Market Indicate by check mark whether the regi

In [None]:
!pip install -q PyPDF2

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import re
import nltk
nltk.download("punkt")
from nltk import sent_tokenize, word_tokenize

def auto_label_financial_text(text):
    sentences = sent_tokenize(text)
    data = []

    for sent in sentences:
        tokens = word_tokenize(sent)
        labels = ["O"] * len(tokens)

        for i, tok in enumerate(tokens):
            if re.match(r"\$[\d\.,]+", tok) or tok.lower() in ["million", "billion"]:
                labels[i] = "B-VALUE"
            elif tok.lower() in ["revenue", "income", "earnings", "expenses", "cash", "liabilities"]:
                labels[i] = "B-METRIC"
            elif re.match(r"\b(20\d{2})\b", tok):
                labels[i] = "B-DATE"

        data.append({"tokens": tokens, "ner_tags": labels})
    return data

financial_data = auto_label_financial_text(text_data)
print(financial_data[:5])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[{'tokens': ['UNITED', 'ST', 'ATES', 'SECURITIES', 'AND', 'EXCHANGE', 'COMMISSION', 'Washington', ',', 'D.C.', '20549', 'FORM', '10-Q', '(', 'Mark', 'One', ')', '☒', 'QUARTERLY', 'REPORT', 'PURSUANT', 'TO', 'SECTION', '13', 'OR', '15', '(', 'd', ')', 'OF', 'THE', 'SECURITIES', 'EXCHANGE', 'ACT', 'OF', '1934', 'For', 'the', 'quarterly', 'period', 'ended', 'March', '31', ',', '2025', 'OR', '☐', 'TRANSITION', 'REPORT', 'PURSUANT', 'TO', 'SECTION', '13', 'OR', '15', '(', 'd', ')', 'OF', 'THE', 'SECURITIES', 'EXCHANGE', 'ACT', 'OF', '1934', 'For', 'the', 'transition', 'period', 'from', 'to', 'Commission', 'File', 'Number', ':', '001-35727', 'Netflix', ',', 'Inc.', '(', 'Exact', 'name', 'of', 'Registrant', 'as', 'specified', 'in', 'its', 'charter', ')', 'Delaware', '77-0467272', '(', 'State', 'or', 'other', 'jurisdiction', 'of', 'incorporation', 'or', 'organization', ')', '(', 'I.R.S', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from datasets import Dataset
train_ds = Dataset.from_list(financial_data)
train_test = train_ds.train_test_split(test_size=0.2, seed=42)
train_ds, test_ds = train_test["train"], train_test["test"]


In [None]:
from transformers import AutoTokenizer

model_path = "/content/drive/MyDrive/fin_ner_model"  # your previously trained model
tokenizer = AutoTokenizer.from_pretrained(model_path)

label_list = ["O", "B-METRIC", "B-VALUE", "I-VALUE", "B-DATE", "I-DATE"]
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=tokenizer.model_max_length, # Use model's max length
        padding="max_length" # Explicitly pad to max length
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        prev_word = None
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            elif word_id != prev_word:
                 # Check if the word_id is within the bounds of the original labels list
                if word_id < len(label):
                    label_ids.append(label2id.get(label[word_id], -100)) # Use .get for safe lookup
                else:
                    # If word_id is out of bounds, assign a padding label
                    label_ids.append(-100)
            else:
                label_ids.append(-100)
            prev_word = word_id

        # Ensure labels list has the same length as input_ids after padding
        # This step might be redundant if padding="max_length" is used correctly above
        # but adding defensively to ensure length match
        while len(label_ids) < len(tokenized_inputs["input_ids"][i]):
             label_ids.append(-100)
        # Truncate labels list if it's longer than input_ids (due to truncation)
        label_ids = label_ids[:len(tokenized_inputs["input_ids"][i])]

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_train = train_ds.map(tokenize_and_align_labels, batched=True)
tokenized_test = test_ds.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/278 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from evaluate import load
import numpy as np

model_path = "/content/drive/MyDrive/fin_ner_model"  # your previously trained model

# Correct the label list to match the pre-trained model (CoNLL-2003 labels)
label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"]
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}


model = AutoModelForTokenClassification.from_pretrained(
    model_path,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

metric = load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
    }

args = TrainingArguments(
    output_dir="/content/drive/MyDrive/netflix_finetuned_ner",
    per_device_train_batch_size=8,
    num_train_epochs=4,
    eval_strategy="epoch", # Corrected key
    logging_dir="./logs",
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.027406,0.9375,0.694444,0.797872
2,No log,0.006522,0.972727,0.990741,0.981651
3,No log,0.007332,0.963964,0.990741,0.977169
4,No log,0.007348,0.963964,0.990741,0.977169


TrainOutput(global_step=140, training_loss=0.04433912209102086, metrics={'train_runtime': 87.4045, 'train_samples_per_second': 12.722, 'train_steps_per_second': 1.602, 'total_flos': 290580382162944.0, 'train_loss': 0.04433912209102086, 'epoch': 4.0})

In [None]:
trainer.save_model("/content/drive/MyDrive/netflix_final_model")


In [None]:
from transformers import pipeline

ner_pipe = pipeline("token-classification", model="/content/drive/MyDrive/netflix_final_model", tokenizer=tokenizer)
text = "Netflix reported revenues of $4.3 billion for the first quarter of 2025."
ner_pipe(text)


Device set to use cuda:0


[{'entity': 'I-PER',
  'score': np.float32(0.9970331),
  'index': 9,
  'word': 'billion',
  'start': 34,
  'end': 41},
 {'entity': 'I-ORG',
  'score': np.float32(0.99813676),
  'index': 15,
  'word': '202',
  'start': 67,
  'end': 70}]

In [None]:
def remap_entities(predictions):
    mapped = []
    for p in predictions:
        ent = p["entity"]
        word = p["word"].lower()

        if "billion" in word or "$" in word:
            ent = "VALUE"
        elif any(k in word for k in ["revenue", "income", "earnings"]):
            ent = "METRIC"
        elif any(k in word for k in ["202", "march", "december", "quarter"]):
            ent = "DATE"
        mapped.append({"entity": ent, "word": p["word"]})
    return mapped

text = "Netflix reported revenue of $4.3 billion for the first quarter of 2025."
preds = ner_pipe(text)
print(remap_entities(preds))


[{'entity': 'METRIC', 'word': 'revenue'}, {'entity': 'VALUE', 'word': 'billion'}, {'entity': 'DATE', 'word': '202'}]


In [None]:
def clean_predictions(preds):
    for p in preds:
        if p["entity"] == "DATE" and p["word"].isdigit() and len(p["word"]) == 3:
            p["word"] = p["word"] + "5"  # fix incomplete '202' → '2025'
    return preds


In [None]:
def merge_value_tokens(preds):
    merged = []
    skip_next = False
    for i, p in enumerate(preds):
        if skip_next:
            skip_next = False
            continue
        if p["entity"] == "VALUE" and i + 1 < len(preds) and preds[i + 1]["entity"] == "VALUE":
            merged.append({"entity": "VALUE", "word": f"{p['word']} {preds[i+1]['word']}"})
            skip_next = True
        else:
            merged.append(p)
    return merged


In [None]:
text = "Netflix reported revenue of $4.3 billion for the first quarter of 2025."
preds = ner_pipe(text)
preds = remap_entities(preds)
preds = merge_value_tokens(preds)
preds = clean_predictions(preds)
print(preds)


[{'entity': 'METRIC', 'word': 'revenue'}, {'entity': 'VALUE', 'word': 'billion'}, {'entity': 'DATE', 'word': '2025'}]


In [None]:
!pip install transformers datasets seqeval
import os, re, torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments




In [None]:
DATA_DIR = "/content"
text_path = os.path.join(DATA_DIR, "netflix_q1_2025_text.txt")
with open(text_path) as f:
    raw_text = f.read()


In [None]:
def auto_label(text):
    patterns = {
        "METRIC": r"\b(revenue|net income|operating income|EPS|earnings per share)\b",
        "VALUE": r"\$\s?[\d,\.]+\s*(billion|million|m|b)?",
        "DATE": r"(?:March|June|September|December)\s+\d{1,2},\s+\d{4}"
    }
    tokens, labels = [], []
    for word in re.findall(r"\w+|\$[\d,\.]+|[^\w\s]", text):
        label = "O"
        for ent, pat in patterns.items():
            if re.fullmatch(pat, word, flags=re.I):
                label = f"B-{ent}"
                break
        tokens.append(word)
        labels.append(label)
    return {"tokens": tokens, "ner_tags": labels}

train_data = [auto_label(raw_text)]
dataset = Dataset.from_list(train_data)
print(dataset[0]["tokens"][:20])
print(dataset[0]["ner_tags"][:20])


['10', '/', '5', '/', '25', ',', '9', ':', '10', 'AM', 'nflx', '-', '20250331', 'UNITED', 'STATES', 'SECURITIES', 'AND', 'EXCHANGE', 'COMMISSION', 'Washington']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
MODEL_DIR = "/content/drive/MyDrive/fin_ner_model"  # your previous model
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR)

label_list = ["O", "B-METRIC", "B-VALUE", "B-DATE"]
id2label = {i:l for i,l in enumerate(label_list)}
label2id = {l:i for i,l in enumerate(label_list)}
model.config.id2label, model.config.label2id = id2label, label2id

def tokenize_align(examples):
    tokenized = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True)
    labels = []
    for i, word_ids in enumerate(tokenized.word_ids(batch_index=0)):
        if word_ids is None:
            labels.append(-100)
        else:
            labels.append(label2id[examples["ner_tags"][word_ids]])
    tokenized["labels"] = labels
    return tokenized

tokenized_ds = dataset.map(tokenize_align)


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
args = TrainingArguments(
    output_dir="/content/fin_ner_model_v2",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_dir="/content/logs",
    save_strategy="no"
)

trainer = Trainer(model=model, args=args, train_dataset=tokenized_ds)
trainer.train()


Step,Training Loss


TrainOutput(global_step=3, training_loss=0.18664435545603433, metrics={'train_runtime': 0.4756, 'train_samples_per_second': 6.308, 'train_steps_per_second': 6.308, 'total_flos': 783939879936.0, 'train_loss': 0.18664435545603433, 'epoch': 3.0})

In [None]:
SAVE_DIR = "/content/drive/MyDrive/fin_ner_model_v2"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print(f"✅ Saved fine-tuned model to {SAVE_DIR}")


✅ Saved fine-tuned model to /content/drive/MyDrive/fin_ner_model_v2


In [None]:
test_sentence = "Netflix reported revenue of $4.3 billion for the quarter ended March 31, 2025."
inputs = tokenizer(test_sentence, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model(**inputs)
pred_ids = outputs.logits.argmax(-1)[0].cpu().numpy()
for token, pred_id in zip(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]), pred_ids):
    print(f"{token:15} -> {id2label[pred_id]}")


[CLS]           -> O
Netflix         -> B-METRIC
reported        -> O
revenue         -> O
of              -> O
$               -> O
4               -> O
.               -> O
3               -> O
billion         -> O
for             -> O
the             -> O
quarter         -> O
ended           -> O
March           -> O
31              -> O
,               -> O
202             -> O
##5             -> O
.               -> O
[SEP]           -> O
