In [1]:
import zipfile
import os
from google.colab import drive

# Mount Google Drive at /content/drive; the archive, dataset and model paths
# used by the following cells all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import zipfile
import os

zip_file_path = "/content/drive/MyDrive/finer-139.zip"
# The archive is unpacked into the Drive root (not the current working
# directory); later cells read /content/drive/MyDrive/finer-139/*.jsonl.
extract_dir = "/content/drive/MyDrive"

# exist_ok=True makes directory creation idempotent without a separate
# existence check, so the cell can be re-run safely.
os.makedirs(extract_dir, exist_ok=True)

# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"Extracted {zip_file_path} to {extract_dir}")

Extracted /content/drive/MyDrive/finer-139.zip to /content/drive/MyDrive


In [2]:
%pip install datasets transformers pandas tqdm evaluate seqeval

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=b357ddef7a864201d69bdc715a9f32a45a676c56ace5f5c021f5f7df44465edb
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval, evaluate
Successfully installed evaluate-0.4.6 seqeval-1.2.2


In [3]:
import json
import pandas as pd

file_path = "/content/drive/MyDrive/finer-139/test.jsonl"

# Peek at the file: read only the first JSONL record so the field names
# can be inspected without loading the whole split.
with open(file_path, 'r', encoding='utf-8') as f:
    first_line = f.readline()
print(first_line)


{"id": 1012878, "tokens": ["The", "changes", "in", "the", "fair", "value", "of", "the", "derivatives", "and", "the", "related", "underlying", "foreign", "currency", "exposures", "resulted", "in", "net", "gains", "of", "$", "11", "million", "and", "$", "23", "million", "for", "the", "three", "months", "ended", "March", "31", ",", "2020", "and", "2019", ",", "respectively", ",", "that", "are", "recognized", "in", "Other", ",", "net", "expenses", "on", "the", "Consolidated", "Statements", "of", "Income", ".", "5", "."], "ner_tags": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}



In [4]:
from datasets import load_dataset

# One JSONL file per split, all extracted under the finer-139 folder on Drive.
split_files = {}
split_files["train"] = "/content/drive/MyDrive/finer-139/train.jsonl"
split_files["validation"] = "/content/drive/MyDrive/finer-139/validation.jsonl"
split_files["test"] = "/content/drive/MyDrive/finer-139/test.jsonl"

# The "json" builder reads each file into the split named by its key.
dataset = load_dataset("json", data_files=split_files)

# Confirmation message followed by the split/feature/row-count summary.
print("Dataset loaded successfully!", dataset, sep="\n")

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset loaded successfully!
DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 900384
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 112494
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 108378
    })
})


In [5]:
import transformers
from transformers import AutoTokenizer
from tqdm.auto import tqdm

# Step 1: Gather the set of distinct NER tag strings that occur in any split.
print("Discovering all unique NER tags from your dataset...")
unique_ner_tags = set()
for split, split_examples in dataset.items():
    progress = tqdm(split_examples, desc=f"Scanning {split} split")
    for example in progress:
        unique_ner_tags.update(example['ner_tags'])

# Step 2: Sort the tags alphabetically and, when 'O' occurs, place it at index 0.
sorted_labels = sorted(tag for tag in unique_ner_tags if tag != "O")
label_names = (["O"] + sorted_labels) if "O" in unique_ner_tags else sorted_labels

print(f"\nDiscovered {len(label_names)} unique labels.")

# Step 3: Lookups in both directions between label strings and integer ids,
# where each id is the label's position in label_names.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

# Tokenizer is loaded from the same checkpoint name that later cells pass
# to the model loader.
model_checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Step 4: Use the robust tokenization function from before
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and align word labels to tokens.

    Reads ``examples["tokens"]`` (one list of words per example) and
    ``examples["ner_tags"]`` (one list of per-word labels per example), and
    uses the module-level ``tokenizer`` and ``label2id``.

    Only the first token produced for each word receives that word's label.
    Tokens with no source word (word id ``None``) and the remaining tokens of
    a multi-token word receive -100, the value treated as an ignored index
    elsewhere in this notebook.

    Returns the tokenizer output with an added ``"labels"`` entry holding one
    list of label ids per example.
    """
    encodings = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        # String labels are mapped to ids; the type is judged from the first
        # element only, and empty label lists are left untouched.
        if word_labels and isinstance(word_labels[0], str):
            word_labels = [label2id[name] for name in word_labels]

        token_labels = []
        last_word = None
        for current_word in encodings.word_ids(batch_index=batch_idx):
            starts_new_word = current_word is not None and current_word != last_word
            token_labels.append(word_labels[current_word] if starts_new_word else -100)
            last_word = current_word
        aligned.append(token_labels)

    encodings["labels"] = aligned
    return encodings

# Step 5: Run tokenize_and_align_labels over every split. batched=True means
# the function receives a batch of examples per call rather than one example.
print("\nApplying tokenization and aligning labels...")
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    # Optional: Turn off caching if you suspect old, bad data is being used
    # load_from_cache_file=False
)
print("...Done!")

# NOTE(review): this prints the complete first training example (all tokens,
# input ids and labels), which produces a very long output line; consider
# printing a truncated preview instead.
print("\nSuccessfully tokenized the dataset. Example of a tokenized sample:")
print(tokenized_datasets["train"][0])

Discovering all unique NER tags from your dataset...


Scanning train split:   0%|          | 0/900384 [00:00<?, ?it/s]

Scanning validation split:   0%|          | 0/112494 [00:00<?, ?it/s]

Scanning test split:   0%|          | 0/108378 [00:00<?, ?it/s]


Discovered 170 unique labels.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]


Applying tokenization and aligning labels...


Map:   0%|          | 0/900384 [00:00<?, ? examples/s]

Map:   0%|          | 0/112494 [00:00<?, ? examples/s]

Map:   0%|          | 0/108378 [00:00<?, ? examples/s]

...Done!

Successfully tokenized the dataset. Example of a tokenized sample:
{'id': 0, 'tokens': ['ITEM', '1', 'Financial', 'Statements', 'Lennar', 'Corporation', 'and', 'Subsidiaries', 'Condensed', 'Consolidated', 'Balance', 'Sheets', '(', 'Dollars', 'in', 'thousands', ',', 'except', 'shares', 'and', 'per', 'share', 'amounts', ')', '(', 'unaudited', ')', '(', '1', ')', 'Under', 'certain', 'provisions', 'of', 'Accounting', 'Standards', 'Codification', '(', '“', 'ASC', '”', ')', 'Topic', '810', ',', 'Consolidations', ',', '(', '“', 'ASC', '810', '”', ')', 'the', 'Company', 'is', 'required', 'to', 'separately', 'disclose', 'on', 'its', 'condensed', 'consolidated', 'balance', 'sheets', 'the', 'assets', 'owned', 'by', 'consolidated', 'variable', 'interest', 'entities', '(', '“', 'VIEs', '”', ')', 'and', 'liabilities', 'of', 'consolidated', 'VIEs', 'as', 'to', 'which', 'neither', 'Lennar', 'Corporation', ',', 'or', 'any', 'of', 'its', 'subsidiaries', ',', 'has', 'any', 'obligations', '.', '

In [6]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

# Load the pre-trained checkpoint with a token-classification head sized to
# our label set. The checkpoint's own classifier has a different number of
# outputs, so ignore_mismatched_sizes=True lets that layer be re-initialized
# instead of raising a shape-mismatch error.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

# The data collator pads the inputs and labels of each batch to a common
# length at batch-construction time.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

print("Model and Data Collator are ready.")
# Report the label count from label_names so the message cannot drift from
# the num_labels value actually passed to the model.
print(f"The pre-trained BERT body was loaded, and a new NER head for {len(label_names)} labels was initialized.")

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([170, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([170]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and Data Collator are ready.
The pre-trained BERT body was loaded, and a new NER head for 170 labels was initialized.


In [7]:
import numpy as np
import evaluate  # <-- The fix is here

# Load the seqeval metric through the 'evaluate' library; compute_metrics
# below calls metric.compute() on string label sequences.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    ``p`` unpacks into ``(predictions, labels)``. The predictions are reduced
    with an argmax over axis 2 (presumably the per-label score axis), so they
    must have at least three dimensions. Positions whose gold label is -100
    are dropped from both sequences before scoring. Uses the module-level
    ``label_names`` and ``metric``.

    Returns a dict with the overall "precision", "recall", "f1" and
    "accuracy" values reported by the metric.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label; the same mask is applied
        # to predictions and references so the two stay aligned.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_names[pred] for pred, _ in kept])
        true_labels.append([label_names[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Expose only the overall scores under short names.
    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

print("Metrics function is now defined using the 'evaluate' library.")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Metrics function is now defined using the 'evaluate' library.


In [None]:
from transformers import TrainingArguments, Trainer

print("Initializing TrainingArguments with legacy parameters for compatibility...")

# NOTE(review): this cell and the training cell that follows it are
# near-duplicates (this one: 2 epochs on the full train split; the next one:
# 4 epochs on a 60000-example subset). Both pass the same `model` object to
# Trainer, so running them back to back fine-tunes that model twice.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/finer-139/finer_ner_model", # Output directory on Drive
    # NOTE(review): the training log below lists only "Training Loss", so
    # do_eval/eval_steps do not appear to trigger periodic evaluation on
    # their own; presumably an evaluation strategy must also be set --
    # confirm against the TrainingArguments documentation.
    do_eval=True,                   # Explicitly enable evaluation
    eval_steps=5000,                # Intended evaluation interval (steps)

    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=5000,             # Training loss is logged every 5000 steps
)

# Build the Trainer on the full tokenized train split, with the validation
# split as evaluation data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Launch the training!
print("\n Starting model training... This will take a while. ")
trainer.train()

# Save the model explicitly after training, into the same directory as output_dir
trainer.save_model("/content/drive/MyDrive/finer-139/finer_ner_model")

print("\n Training finished successfully! Your model is saved in the '/content/drive/MyDrive/finer-139/finer_ner_model' directory.")

Initializing TrainingArguments with legacy parameters for compatibility...

 Starting model training... This will take a while. 


  trainer = Trainer(


Step,Training Loss
5000,0.032
10000,0.0189
15000,0.0161
20000,0.0149
25000,0.014
30000,0.0131
35000,0.0122
40000,0.0119


In [8]:
from transformers import TrainingArguments, Trainer

print("Initializing TrainingArguments with legacy parameters for compatibility...")

# Directory on Drive that receives both the Trainer output and the final saved model.
finer_model_dir = "/content/drive/MyDrive/finer-139/finer_ner_model"

# NOTE(review): the training log for this cell lists only "Training Loss", so
# do_eval/eval_steps do not appear to trigger periodic evaluation on their own;
# presumably an evaluation strategy must also be set -- confirm against the
# TrainingArguments documentation.
training_args = TrainingArguments(
    output_dir=finer_model_dir,
    do_eval=True,
    eval_steps=5000,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_steps=5000,
)

# Train on the first 60000 tokenized training examples only.
train_subset = tokenized_datasets["train"].select(range(60000))

# The full validation split is supplied as evaluation data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("\n Starting model training... This will take a while. ")
trainer.train()

# Write the final model explicitly once training has finished.
trainer.save_model(finer_model_dir)

print("\n Training finished successfully! Your model is saved in the '/content/drive/MyDrive/finer-139/finer_ner_model' directory.")

Initializing TrainingArguments with legacy parameters for compatibility...


  trainer = Trainer(



 Starting model training... This will take a while. 


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabinash9731[0m ([33mabinash9731-amrita-vishwa-vidyapeetham[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5000,0.0304
10000,0.0075
15000,0.003



 Training finished successfully! Your model is saved in the '/content/drive/MyDrive/finer-139/finer_ner_model' directory.


In [31]:
print("Evaluating the model on a smaller subset of the test dataset...")

# A smaller evaluation batch size is used here to avoid OutOfMemoryError.
trainer.args.per_device_eval_batch_size = 4

# Score only the first 1000 tokenized test examples.
test_subset = tokenized_datasets["test"].select(range(1000))
results = trainer.evaluate(test_subset)

# Header line followed by the metrics dict returned by trainer.evaluate().
print("\nEvaluation Results:", results, sep="\n")

Evaluating the model on a smaller subset of the test dataset...



Evaluation Results:
{'eval_loss': 0.03848104178905487, 'eval_precision': 0.5072046109510087, 'eval_recall': 0.4861878453038674, 'eval_f1': 0.49647390691114246, 'eval_accuracy': 0.9941889843355229, 'eval_runtime': 8.0826, 'eval_samples_per_second': 123.723, 'eval_steps_per_second': 30.931, 'epoch': 4.0}


In [32]:
# Free-form sample text to run through the NER pipeline.
my_text = """
Apple Inc. announced revenue of $94.8 billion for the second quarter of 2023,
a decrease of 3% year over year. Net income was $24.1 billion, and diluted
earnings per share were $1.52. The company reported a gross margin of 44.3%.
"""

# NOTE(review): `ner_pipeline` is created in a cell further down this
# notebook, so on a fresh kernel run top to bottom this call raises NameError.
# Use the pipeline to predict named entities on your text
my_prediction = ner_pipeline(my_text)

print("NER Pipeline Output for your text:")
print(my_prediction)

NER Pipeline Output for your text:
[{'entity': 'B-Revenues', 'score': np.float32(0.9310465), 'index': 8, 'word': '94', 'start': 34, 'end': 36}, {'entity': 'B-Revenues', 'score': np.float32(0.9031049), 'index': 9, 'word': '.', 'start': 36, 'end': 37}, {'entity': 'B-Revenues', 'score': np.float32(0.9212142), 'index': 10, 'word': '8', 'start': 37, 'end': 38}]


Let's look at the output from the last inference step:

In [33]:
# Example text for inference
text = "Revenue for the quarter was $1.2 billion."

# NOTE(review): `ner_pipeline` is created in a cell further down this
# notebook, so on a fresh kernel run top to bottom this call raises NameError.
# Use the pipeline to predict named entities
prediction = ner_pipeline(text)

print("NER Pipeline Output:")
print(prediction)

NER Pipeline Output:
[{'entity': 'B-Revenues', 'score': np.float32(0.4649795), 'index': 7, 'word': '1', 'start': 29, 'end': 30}, {'entity': 'B-Revenues', 'score': np.float32(0.5288543), 'index': 8, 'word': '.', 'start': 30, 'end': 31}, {'entity': 'B-Revenues', 'score': np.float32(0.6172208), 'index': 9, 'word': '2', 'start': 31, 'end': 32}]


In [34]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
import torch

# Load the fine-tuned model and tokenizer in this cell, so that building the
# pipeline does not depend on `loaded_model` / `loaded_tokenizer` having been
# defined by a cell that appears later in the notebook (NameError on a fresh
# kernel otherwise). The directory is the one trainer.save_model() wrote to.
model_path = "/content/drive/MyDrive/finer-139/finer_ner_model"
loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
loaded_model = AutoModelForTokenClassification.from_pretrained(model_path)

# Create a NER pipeline using the trained model and tokenizer
# Specify the device to use (0 for GPU if available, -1 for CPU)
device = 0 if torch.cuda.is_available() else -1
ner_pipeline = pipeline("ner", model=loaded_model, tokenizer=loaded_tokenizer, device=device)

print("NER pipeline created successfully.")

Device set to use cuda:0


NER pipeline created successfully.


In [35]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# Load the trained model and tokenizer from the directory that the training
# cell passed to trainer.save_model().
model_path = "/content/drive/MyDrive/finer-139/finer_ner_model"
loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
loaded_model = AutoModelForTokenClassification.from_pretrained(model_path)

print(f"Model and tokenizer loaded from {model_path}")

Model and tokenizer loaded from /content/drive/MyDrive/finer-139/finer_ner_model


In [36]:
# Raw sentence to tag without going through the pipeline helper.
text = "Revenue for the quarter was $1.2 billion."

# Put the model on the GPU when one is available, and switch it to evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)
loaded_model.eval()

# The text is passed as one plain string (is_split_into_words=False), so the
# tokenizer performs the word splitting itself. Every returned tensor is then
# moved to the model's device.
encoded = loaded_tokenizer(text, return_tensors="pt", is_split_into_words=False)
tokenized_input = {name: tensor.to(device) for name, tensor in encoded.items()}

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = loaded_model(**tokenized_input)

# Highest-scoring label id for every token position.
predictions = torch.argmax(outputs.logits, dim=2)

# Translate the ids of the single input sequence back to label strings.
id_to_label = loaded_model.config.id2label
predicted_labels = [id_to_label[label_id] for label_id in predictions[0].tolist()]

print("Predicted labels for the tokens:", predicted_labels, sep="\n")

# These are token-level predictions, one per tokenizer token. Mapping them
# back to the original words needs the tokenizer's word_ids, in the same way
# tokenize_and_align_labels uses them; that step is not done here.

Predicted labels for the tokens:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Revenues', 'B-Revenues', 'B-Revenues', 'O', 'O', 'O']


**Explanation of the output:**

The `predicted_labels` list shows the predicted NER tag for each token produced by the tokenizer. Note that due to subword tokenization, a single word might be split into multiple tokens, each with its own predicted tag.

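For example, if the word "billion" were split into "bill" and "ion", you would see a tag for both "bill" and "ion". In the output above, "1.2" is split into the three tokens "1", ".", and "2", and each of them is tagged `B-Revenues`.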

To get word-level predictions, you would typically need to:
1. Map the token predictions back to the original words using the `word_ids` information from the tokenizer's output (similar to the `tokenize_and_align_labels` function).
2. Apply a strategy to determine the word's overall tag (e.g., use the tag of the first token of the word).

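The `transformers` library provides a `pipeline` function that can handle this grouping for you when an `aggregation_strategy` (for example `"first"`) is passed. With the default settings used in this notebook, the pipeline still returns one prediction per token, as the pipeline outputs above show.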