In [None]:
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
# ============================================
# üß† FINANCIAL NER MODEL ‚Äî STEP 1
# Importing Required Libraries
# ============================================

# --- Core data handling ---
import os
import json
import pandas as pd
import numpy as np

# --- Machine learning & model tools ---
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# --- Deep learning / Transformers ---
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from transformers import pipeline

# --- Dataset utilities ---
from datasets import Dataset
from evaluate import load # Corrected import

# --- General utilities ---
from tqdm import tqdm
import torch
import pickle
import matplotlib.pyplot as plt

print("‚úÖ Libraries imported successfully.")

‚úÖ Libraries imported successfully.


In [None]:
# ============================================
# üß† STEP 1 ‚Äî Load, Clean, and Process Parquet File
# ============================================

import pandas as pd
import re
import ast
from datasets import Dataset

# --- Path to your parquet file ---
parquet_path = "/content/dataset.parquet"  # change if needed

# --- Load parquet file ---
df = pd.read_parquet(parquet_path)
print(f"‚úÖ Parquet file loaded successfully with shape: {df.shape}")
print(f"üîπ Columns: {list(df.columns)}")

# --- STEP 2: Basic Cleaning ---
def clean_text(text):
    """Flatten `text` onto one line with single spaces between words.

    Line breaks are turned into spaces, every run of whitespace is collapsed
    to one space, and leading/trailing whitespace is removed. Anything that
    is not a `str` yields an empty string.
    """
    if isinstance(text, str):
        single_line = text.replace("\n", " ").replace("\r", " ")
        return re.sub(r'\s+', ' ', single_line).strip()
    return ""

# Pick the first column whose (lower-cased) name contains one of the hints
_text_hints = ("text", "sentence", "content")
text_col = next(
    (name for name in df.columns if any(hint in name.lower() for hint in _text_hints)),
    None,
)

# Fail loudly rather than guessing when nothing looks like a text column
if text_col is None:
    raise ValueError("‚ùå No text column found automatically. Please specify one manually, e.g. df['text_column'].")

df["sentence"] = df[text_col].apply(clean_text)

# --- STEP 3: Handle Labels (if available) ---
# Some parquet datasets may already have entity tags or BIO labels
label_col = None
for col in df.columns:
    if "tag" in col.lower() or "label" in col.lower():
        label_col = col
        break

if label_col:
    print(f"‚úÖ Found label column: {label_col}")
    # Try to safely parse list-like strings
    def safe_parse_list(val):
        """Coerce one raw label cell into a Python list of tags.

        Accepted inputs:
          * a ``list`` (returned as-is) or ``tuple`` (converted to a list);
          * any non-string object exposing ``tolist()`` that yields a list,
            e.g. a numpy array (pandas commonly materialises parquet list
            columns this way);
          * a string holding a Python list literal such as ``"['O', 'B-ORG']"``.

        Anything else, or a string that cannot be parsed into a list, falls
        back to ``["O"]``.
        """
        if isinstance(val, list):
            return val
        if isinstance(val, tuple):
            return list(val)
        if not isinstance(val, str) and hasattr(val, "tolist"):
            converted = val.tolist()
            # numpy scalars also have tolist() but return a scalar
            if isinstance(converted, list):
                return converted
        if isinstance(val, str) and val.strip().startswith("["):
            try:
                parsed = ast.literal_eval(val.strip())
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Malformed literal: fall through to the default below
                parsed = None
            if isinstance(parsed, list):
                return parsed
        return ["O"]  # default if parsing fails
    df["tag"] = df[label_col].apply(safe_parse_list)
else:
    print("‚ö†Ô∏è No label column found. Defaulting all tags to 'O' (unlabeled).")
    df["tag"] = [["O"] for _ in range(len(df))]

# --- STEP 4: Drop empty or invalid rows ---
# `df` is rebound to the filtered frame; its index is not reset here, so
# row labels may have gaps afterwards.
df = df.dropna(subset=["sentence"])
df = df[df["sentence"].str.strip() != ""]
print(f"‚úÖ Cleaned dataset shape: {df.shape}")

# --- STEP 5: Generate label mappings ---
# Ids follow the alphabetical order of the distinct tags found in df["tag"].
unique_tags = sorted(set(tag for tags in df["tag"] for tag in tags))
label2id = {label: i for i, label in enumerate(unique_tags)}
id2label = {i: label for label, i in label2id.items()}

# --- Encode labels ---
# Every tag comes from `unique_tags`, so the `.get(..., 0)` fallback is only
# a safety net.
df["label_id"] = df["tag"].apply(lambda tags: [label2id.get(t, 0) for t in tags])

# --- STEP 6: Convert to Hugging Face Dataset ---
hf_dataset = Dataset.from_pandas(df[["sentence", "label_id"]])
print("‚úÖ Hugging Face dataset ready:")
print(hf_dataset)
# Preview only the first 200 characters of the first sentence
print(f"üîπ Example sentence:\n{df['sentence'].iloc[0][:200]}...")
print(f"üîπ Example tags: {df['tag'].iloc[0]}")


‚úÖ Parquet file loaded successfully with shape: (70974, 169)
üîπ Columns: ['authors', 'date_download', 'date_modify', 'date_publish', 'description', 'filename', 'image_url', 'language', 'localpath', 'maintext', 'source_domain', 'title', 'title_page', 'title_rss', 'url', 'mentioned_companies', 'related_companies', 'industries', 'named_entities', 'prev_day_price_AAPL', 'next_day_price_AAPL', 'curr_day_price_AAPL', 'sentiment', 'emotion', 'news_outlet', 'prev_day_price_MA', 'next_day_price_MA', 'curr_day_price_MA', 'prev_day_price_T', 'next_day_price_T', 'curr_day_price_T', 'prev_day_price_VZ', 'next_day_price_VZ', 'curr_day_price_VZ', 'prev_day_price_GOOGL', 'next_day_price_GOOGL', 'curr_day_price_GOOGL', 'prev_day_price_C', 'next_day_price_C', 'curr_day_price_C', 'prev_day_price_BABA', 'next_day_price_BABA', 'curr_day_price_BABA', 'prev_day_price_AMZN', 'next_day_price_AMZN', 'curr_day_price_AMZN', 'prev_day_price_NFLX', 'next_day_price_NFLX', 'curr_day_price_NFLX', 'prev_day_price_WF

In [None]:
# ============================================
# ‚úÖ Clean, Tokenize, and Save BIO-tagged Dataset
# ============================================

import ast
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer
from datasets import Dataset
import json

# --- Load tokenizer ---
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# --- Check dataframe ---
if "named_entities" not in df.columns or "maintext" not in df.columns:
    raise ValueError("‚ùå Dataset must contain both 'maintext' and 'named_entities' columns!")

# --- Convert safely ---
def safe_eval(x):
    """Coerce one raw `named_entities` cell into a list.

    Handles, in order:
      * ``list`` (returned unchanged) and ``tuple`` (converted to a list);
      * non-string objects with ``tolist()`` that yield a list, e.g. numpy
        arrays;
      * strings/bytes holding either a Python literal or JSON (JSON is tried
        second so payloads containing ``null``/``true``/``false`` still parse).

    Returns:
        The parsed list, or ``[]`` when the cell is missing, unparseable, or
        parses to something that is not a list/tuple.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, tuple):
        return list(x)
    if not isinstance(x, (str, bytes)):
        if hasattr(x, "tolist"):
            as_list = x.tolist()
            if isinstance(as_list, list):
                return as_list
        return []

    parsed = None
    for parser in (ast.literal_eval, json.loads):
        try:
            parsed = parser(x)
            break
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # json.JSONDecodeError is a ValueError; try the next parser
            continue

    if isinstance(parsed, tuple):
        return list(parsed)
    return parsed if isinstance(parsed, list) else []

df["named_entities"] = df["named_entities"].apply(safe_eval)

# --- Truncate to max 512 tokens ---
MAX_LEN = 512
def truncate_text(text):
    """Return `text` cut down to at most MAX_LEN tokenizer tokens.

    The input is stringified, tokenized with the module-level `tokenizer`,
    limited to the first MAX_LEN tokens and converted back into a string.
    """
    kept_tokens = tokenizer.tokenize(str(text))[:MAX_LEN]
    return tokenizer.convert_tokens_to_string(kept_tokens)

df["maintext"] = df["maintext"].astype(str).apply(truncate_text)
print(f"‚úÖ Truncated all long texts to max {MAX_LEN} tokens")

# --- BIO tagging ---
def create_bio_tags(sentence, entities, tok=None):
    """Tokenize `sentence` and derive one BIO label per token from `entities`.

    Args:
        sentence: Text to tokenize.
        entities: Iterable of entity records. Only dicts are used; the surface
            form is read from ``"word"`` (falling back to ``"entity"``) and the
            type from ``"entity_group"`` (falling back to ``"label"``).
        tok: Optional object with a ``tokenize(str) -> list[str]`` method.
            Defaults to the module-level `tokenizer`.

    Returns:
        ``(tokens, labels)`` - two lists of equal length. Each entity record
        tags the first occurrence of its token sequence that is still fully
        untagged, so repeated mentions claim successive occurrences and an
        already-tagged span is never overwritten.
    """
    tok = tokenizer if tok is None else tok
    tokens = tok.tokenize(sentence)
    labels = ["O"] * len(tokens)

    for ent in entities:
        if not isinstance(ent, dict):
            continue
        word = ent.get("word") or ent.get("entity") or ""
        label = ent.get("entity_group") or ent.get("label") or "O"

        # A non-string surface form cannot be tokenized; skip just this entity
        if not isinstance(word, str) or not word.strip() or label == "O":
            continue

        ent_tokens = tok.tokenize(word)
        span = len(ent_tokens)
        # An empty token list would "match" at position 0 and mislabel token 0
        if span == 0:
            continue

        for start in range(len(tokens) - span + 1):
            if tokens[start:start + span] != ent_tokens:
                continue
            # Occurrence already claimed by an earlier entity: look further on
            if any(tag != "O" for tag in labels[start:start + span]):
                continue
            labels[start] = f"B-{label}"
            labels[start + 1:start + span] = [f"I-{label}"] * (span - 1)
            break

    return tokens, labels

# Tag every row, remembering the *position* of each row that succeeded so the
# results can be attached to exactly those rows regardless of df's index.
bio_data = []
kept_positions = []
for pos, (i, row) in enumerate(tqdm(df.iterrows(), total=len(df))):
    try:
        tokens, labels = create_bio_tags(row["maintext"], row["named_entities"])
    except Exception as e:
        print(f"‚ö†Ô∏è Skipping row {i} due to error: {e}")
        continue
    bio_data.append({"tokens": tokens, "ner_tags": labels})
    kept_positions.append(pos)

# Explicit columns keep the frame well-formed even if every row was skipped
bio_df = pd.DataFrame(bio_data, columns=["tokens", "ner_tags"])
print("‚úÖ BIO tagging complete:", bio_df.shape)

# --- Merge ---
# Column assignment aligns on index labels. bio_df has a fresh 0..n-1 index,
# so df is first reduced to the successfully tagged rows and re-indexed to
# match; otherwise skipped rows or index gaps would misalign tokens and
# leave NaNs that break the label scan below.
df = df.iloc[kept_positions].reset_index(drop=True)
df["tokens"] = bio_df["tokens"]
df["ner_tags"] = bio_df["ner_tags"]

# --- Label mapping ---
# Ids follow the alphabetical order of the tags that actually occur.
unique_labels = sorted({tag for tags in df["ner_tags"] for tag in tags})
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
print("üîπ Labels:", label2id)

# --- Tokenize and align labels ---
def tokenize_and_align_labels(examples):
    """Turn a batch of WordPiece tokens + BIO tags into fixed-length model inputs.

    `examples["tokens"]` already holds tokenizer output (WordPieces, including
    ``##`` continuation pieces) and `examples["ner_tags"]` holds one tag per
    piece. Running those pieces through the tokenizer again with
    ``is_split_into_words=True`` would re-split them (``##ing`` becomes
    ``#``, ``#``, ``ing``), corrupting the input ids and duplicating labels.
    The pieces are therefore mapped straight to ids.

    Each sequence is laid out as ``[CLS] pieces... [SEP] [PAD]...`` with a
    total length of MAX_LEN; pieces beyond ``MAX_LEN - 2`` are dropped.
    Special and padding positions get label ``-100``.

    Args:
        examples: Batch dict with parallel lists under ``"tokens"`` and
            ``"ner_tags"``.

    Returns:
        Dict with ``input_ids``, ``token_type_ids``, ``attention_mask`` and
        ``labels``, each a list of MAX_LEN-long lists.
    """
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    pad_id = tokenizer.pad_token_id
    room = MAX_LEN - 2  # space left once [CLS] and [SEP] are added

    batch = {"input_ids": [], "token_type_ids": [], "attention_mask": [], "labels": []}
    for pieces, tags in zip(examples["tokens"], examples["ner_tags"]):
        pieces = list(pieces)[:room]
        tags = list(tags)[:room]

        ids = [cls_id] + list(tokenizer.convert_tokens_to_ids(pieces)) + [sep_id]
        tag_ids = [-100] + [label2id[tag] for tag in tags] + [-100]
        n_pad = MAX_LEN - len(ids)

        batch["input_ids"].append(ids + [pad_id] * n_pad)
        batch["token_type_ids"].append([0] * MAX_LEN)
        batch["attention_mask"].append([1] * len(ids) + [0] * n_pad)
        batch["labels"].append(tag_ids + [-100] * n_pad)

    return batch

# --- Convert to HF Dataset ---
# Only the token/tag columns are carried over; they are removed again once
# the mapped features have been produced.
dataset = Dataset.from_pandas(df[["tokens", "ner_tags"]])
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=["tokens", "ner_tags"])

# --- Split train/test ---
# 90/10 split with a fixed seed so the split is repeatable.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

print("‚úÖ Dataset ready for training:")
print(split_dataset)

# --- Save tokenized datasets & label mappings ---
# Written relative to the current working directory.
train_dataset.save_to_disk("train_dataset")
test_dataset.save_to_disk("test_dataset")

# JSON object keys are always strings, so the integer keys of id2label are
# stored as strings and must be converted back to int when reloading.
with open("label_mappings.json", "w") as f:
    json.dump({"label2id": label2id, "id2label": id2label}, f, indent=4)

print("üíæ Saved tokenized datasets and label mappings successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (697 > 512). Running this sequence through the model will result in indexing errors


‚úÖ Truncated all long texts to max 512 tokens


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 70974/70974 [03:27<00:00, 342.29it/s]


‚úÖ BIO tagging complete: (70974, 2)
üîπ Labels: {'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}


Map:   0%|          | 0/70974 [00:00<?, ? examples/s]

‚úÖ Dataset ready for training:
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 63876
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 7098
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/63876 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7098 [00:00<?, ? examples/s]

üíæ Saved tokenized datasets and label mappings successfully!


In [None]:
# ============================================
# üöÄ Fine-tune dslim/bert-base-NER (robust TrainingArguments)
# - Handles old/new transformers API by adding only supported kwargs
# - Trains 3 epochs, evaluates, prints accuracy/precision/recall/f1
# - Saves model folder and a .pkl with state_dict + mappings
# ============================================

from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
from datasets import load_from_disk
import torch
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import json, pickle, inspect, os

# --- Paths (assumes you already saved these) ---
TRAIN_DS = "train_dataset"
TEST_DS = "test_dataset"
LABEL_MAP = "label_mappings.json"
MODEL_NAME = "dslim/bert-base-NER"
SAVE_DIR = "./fine_tuned_dslim_ner_model"  # folder to save full model/tokenizer
PKL_PATH = "financial_ner_model.pkl"       # pickle fallback

# --- Load data & mappings ---
print("üì¶ Loading tokenized datasets...")
train_dataset = load_from_disk(TRAIN_DS)
test_dataset = load_from_disk(TEST_DS)

with open(LABEL_MAP, "r") as fh:
    mappings = json.load(fh)
label2id = mappings["label2id"]
# id2label may be saved as dict of ints or strings ‚Äî normalize to int->str
id2label = {int(k): v for k, v in mappings["id2label"].items()}

print(f"‚úÖ Loaded datasets. #train={len(train_dataset)}, #test={len(test_dataset)}")
print("üîπ Labels:", label2id)

# --- Load tokenizer & model ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, local_files_only=False)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# --- Data collator ---
data_collator = DataCollatorForTokenClassification(tokenizer)

# --- Metrics function ---
def compute_metrics(eval_pred):
    """Token-level accuracy and weighted precision/recall/F1 for the Trainer.

    `eval_pred` unpacks into ``(logits, labels)``. The predicted class is the
    argmax over the last logits axis; positions whose label is ``-100`` are
    excluded. If nothing remains, all four metrics are reported as 0.0.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=2)
    gold = np.asarray(labels)

    # Boolean mask of positions that carry a real label
    scored = gold != -100
    if not scored.any():
        return {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}

    true_labels = gold[scored].astype(int).tolist()
    true_preds = predicted[scored].astype(int).tolist()

    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_preds, average="weighted", zero_division=0
    )
    return {
        "accuracy": accuracy_score(true_labels, true_preds),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# --- Build TrainingArguments kwargs robustly (only include supported params) ---
ta_params = dict(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    do_train=True,
    do_eval=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# Per-epoch evaluation: newer transformers releases call this kwarg
# `eval_strategy`, older ones `evaluation_strategy`. Set whichever the
# installed version accepts (preferring the new name) so evaluation really
# runs every epoch instead of being silently skipped.
ta_sig = inspect.signature(TrainingArguments.__init__)
for strategy_kwarg in ("eval_strategy", "evaluation_strategy"):
    if strategy_kwarg in ta_sig.parameters:
        ta_params[strategy_kwarg] = "epoch"
        break
if "save_strategy" in ta_sig.parameters:
    ta_params["save_strategy"] = "epoch"

# Create TrainingArguments, dropping any kwarg this version does not know
training_args = TrainingArguments(**{k: v for k, v in ta_params.items() if k in ta_sig.parameters})

# --- Trainer ---
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# Newer Trainer versions take the tokenizer as `processing_class`; the old
# `tokenizer=` kwarg is deprecated there. Use whichever name is supported.
if "processing_class" in inspect.signature(Trainer.__init__).parameters:
    trainer_kwargs["processing_class"] = tokenizer
else:
    trainer_kwargs["tokenizer"] = tokenizer

trainer = Trainer(**trainer_kwargs)

# --- Train ---
print("üî• Starting fine-tuning (dslim/bert-base-NER)...")
trainer.train()

# --- Evaluate ---
# Runs on the `eval_dataset` given to the Trainer (the held-out test split).
print("\nüìä Evaluating on test set...")
metrics = trainer.evaluate()
print("üìä Evaluation Metrics (from trainer.evaluate()):")
for k, v in metrics.items():
    # Only numeric entries are printed; anything else is skipped
    if isinstance(v, (float, int)):
        print(f"  {k}: {v:.4f}")

# --- Save model & tokenizer properly (preferred) ---
# Writes the model and tokenizer files into SAVE_DIR.
os.makedirs(SAVE_DIR, exist_ok=True)
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print(f"üíæ Saved model & tokenizer to folder: {SAVE_DIR}")

# --- Also save a pickle with state_dict + mappings (optional) ---
# "tokenizer_name" stores the SAVE_DIR path, not a tokenizer object.
pkl_data = {
    "model_state_dict": model.state_dict(),
    "label2id": label2id,
    "id2label": id2label,
    "tokenizer_name": SAVE_DIR,
}
with open(PKL_PATH, "wb") as f:
    pickle.dump(pkl_data, f)
print(f"üíæ Also saved model state + mappings to pickle: {PKL_PATH}")

# --- Done ---
print("‚úÖ Training + evaluation + save completed.")


üì¶ Loading tokenized datasets...
‚úÖ Loaded datasets. #train=63876, #test=7098
üîπ Labels: {'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}


model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


üî• Starting fine-tuning (dslim/bert-base-NER)...


Step,Training Loss
100,0.6034
200,0.1005
300,0.0791
400,0.0796
500,0.0744
600,0.0709
700,0.0673
800,0.071
900,0.0634
1000,0.0643



üìä Evaluating on test set...


üìä Evaluation Metrics (from trainer.evaluate()):
  eval_loss: 0.0304
  eval_accuracy: 0.9892
  eval_precision: 0.9894
  eval_recall: 0.9892
  eval_f1: 0.9893
  eval_runtime: 62.6393
  eval_samples_per_second: 113.3160
  eval_steps_per_second: 14.1760
  epoch: 3.0000
üíæ Saved model & tokenizer to folder: ./fine_tuned_dslim_ner_model
üíæ Also saved model state + mappings to pickle: financial_ner_model.pkl
‚úÖ Training + evaluation + save completed.


In [None]:
# ============================================================
# üíæ Save the entire training session (model, tokenizer, mappings)
# ============================================================

import pickle
from transformers import BertForTokenClassification, BertTokenizerFast

# Output file, relative to the current working directory
save_path = "ner_training_session.pkl"

# Package all objects you'll need for reloading.
# NOTE(review): this duplicates the weights already written by
# save_pretrained() and by the other pickle in the training cell.
training_artifacts = {
    "model_state_dict": model.state_dict(),
    "tokenizer": tokenizer,  # the tokenizer object itself is pickled
    "label2id": label2id,
    "id2label": id2label,
    "training_args": training_args.to_dict()  # plain dict, not the TrainingArguments object
}

# Save as a pickle file
with open(save_path, "wb") as f:
    pickle.dump(training_artifacts, f)

print(f"‚úÖ Training session saved successfully as {save_path}")


‚úÖ Training session saved successfully as ner_training_session.pkl


In [None]:
import pickle
from transformers import BertForTokenClassification, BertTokenizerFast

# Load pickle file
# Caution: unpickling can execute arbitrary code, so only load a file that
# this notebook produced itself.
with open("ner_training_session.pkl", "rb") as f:
    saved = pickle.load(f)

# Rebuild model and tokenizer
# from_pretrained() creates the model from the base checkpoint with the saved
# label mappings; load_state_dict() then overwrites its weights with the
# saved ones.
model = BertForTokenClassification.from_pretrained(
    "dslim/bert-base-NER",
    num_labels=len(saved["label2id"]),
    id2label=saved["id2label"],
    label2id=saved["label2id"]
)
model.load_state_dict(saved["model_state_dict"])
# The tokenizer is taken directly from the pickled object
tokenizer = saved["tokenizer"]

print("‚úÖ Model and tokenizer restored successfully!")


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


‚úÖ Model and tokenizer restored successfully!


In [None]:
# ============================================================
# üìä Named Entity Extraction from Financial Reports
# ============================================================

import torch
import pickle
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.nn.functional import softmax
import numpy as np

# ------------------------------------------------------------
# 1Ô∏è‚É£ Load saved training session
# ------------------------------------------------------------
# Caution: unpickling can execute arbitrary code, so only load a file that
# this notebook produced itself.
with open("ner_training_session.pkl", "rb") as f:
    saved = pickle.load(f)

# Create the model from the base checkpoint with the saved label mappings,
# then overwrite its weights with the saved state dict.
model = BertForTokenClassification.from_pretrained(
    "dslim/bert-base-NER",
    num_labels=len(saved["label2id"]),
    id2label=saved["id2label"],
    label2id=saved["label2id"]
)
model.load_state_dict(saved["model_state_dict"])
# The tokenizer is re-created from the base checkpoint name here; the pickled
# tokenizer in `saved` is not used by this cell.
tokenizer = BertTokenizerFast.from_pretrained("dslim/bert-base-NER")
# Switch to evaluation mode for inference
model.eval()

label2id = saved["label2id"]
id2label = saved["id2label"]

print("‚úÖ Model and tokenizer loaded successfully!")


# ------------------------------------------------------------
# 2Ô∏è‚É£ Entity Extraction Function
# ------------------------------------------------------------
def extract_entities(text, model, tokenizer, id2label, confidence_threshold=0.85):
    """Run token classification on `text` and merge BIO predictions into entities.

    Args:
        text: Input text; it is truncated to 512 tokens.
        model: Callable token-classification model returning an object with
            ``.logits``. If it exposes ``.device``, inputs are moved there.
        tokenizer: Tokenizer used to encode `text` and decode ids to tokens.
        id2label: Mapping from integer class id to BIO label string.
        confidence_threshold: Minimum mean token score an entity needs to be
            returned.

    Returns:
        List of dicts with ``"entity_group"``, ``"word"`` and ``"score"``
        (mean of the per-token maximum softmax probability), in text order.

    Grouping rules:
        * special tokens ([CLS], [SEP], [PAD], ...) are never part of an
          entity and close any open one;
        * ``B-X`` starts a new entity; ``I-X`` extends an open entity of the
          same type, otherwise it starts a new one;
        * ``##`` continuation pieces are glued to the previous piece, other
          pieces are joined with a single space.
    """
    # A single sequence needs no padding; leaving it out also keeps [PAD]
    # positions out of the prediction loop.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Keep inputs on the same device as the model (matters when it is on GPU)
    device = getattr(model, "device", None)
    if device is not None:
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        outputs = model(**encoded)
    probs = softmax(outputs.logits, dim=-1)[0]
    best_scores, best_ids = probs.max(dim=-1)
    # .cpu() so the numpy conversion also works for CUDA tensors
    pred_ids = best_ids.cpu().numpy()
    scores = best_scores.cpu().numpy()

    tokens_decoded = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0].tolist())
    special_tokens = set(getattr(tokenizer, "all_special_tokens", ()))

    entities = []
    current = None  # entity being built: {"label", "word", "scores"}

    def close_current():
        """Move the entity being built (if any) into `entities`."""
        nonlocal current
        if current is not None and current["word"]:
            entities.append({
                "entity_group": current["label"],
                "word": current["word"],
                "score": float(np.mean(current["scores"])),
            })
        current = None

    for token, pred_id, score in zip(tokens_decoded, pred_ids, scores):
        if token in special_tokens:
            close_current()
            continue

        label = id2label[int(pred_id)]
        if label == "O":
            close_current()
            continue

        is_subword = token.startswith("##")
        piece = token[2:] if is_subword else token

        if label.startswith("I-") and current is not None and current["label"] == label[2:]:
            current["word"] += piece if is_subword else " " + piece
            current["scores"].append(score)
        elif label.startswith(("B-", "I-")):
            # B-X, or an I-X that does not continue the open entity
            close_current()
            current = {"label": label[2:], "word": piece, "scores": [score]}
        # Labels outside the BIO scheme are ignored

    # Append any remaining entity
    close_current()

    # Filter low-confidence entities
    return [e for e in entities if e["score"] >= confidence_threshold]


# ------------------------------------------------------------
# 3Ô∏è‚É£ Output Formatter
# ------------------------------------------------------------
def print_extracted_entities(entities):
    """Print a header plus one aligned row per entity, or a notice if there are none.

    Each row shows the entity group (left-aligned, width 10), the word
    (left-aligned, width 25) and the score with three decimals.
    """
    if entities:
        print("üîç Extracted Entities:")
        row_template = "{:<10} | {:<25} | Score: {:.3f}"
        for item in entities:
            print(row_template.format(item['entity_group'], item['word'], item['score']))
    else:
        print("‚ö†Ô∏è No high-confidence entities found.")


# ------------------------------------------------------------
# 4Ô∏è‚É£ Example Usage
# ------------------------------------------------------------
sample_report = """
Apple Inc. reported a net income of $99.8 billion for Q2 2023, with total revenue of $383 billion.
The company announced a quarterly dividend and highlighted strong sales for iPhone 14.
Overall growth increased by 5% year-over-year.
"""

entities = extract_entities(sample_report, model, tokenizer, id2label)
print_extracted_entities(entities)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


‚úÖ Model and tokenizer loaded successfully!
üîç Extracted Entities:
ORG        | Apple                     | Score: 0.855
MISC       | iPhone                    | Score: 0.999


In [None]:
!ls -lh


total 1.3G
-rw-r--r-- 1 root root 444M Nov  9 05:14 dataset.parquet
-rw-r--r-- 1 root root 412M Nov  9 07:16 financial_ner_model.pkl
drwxr-xr-x 2 root root 4.0K Nov  9 07:15 fine_tuned_dslim_ner_model
-rw-r--r-- 1 root root  424 Nov  9 05:34 label_mappings.json
-rw-r--r-- 1 root root 412M Nov  9 07:16 ner_training_session.pkl
drwxr-xr-x 4 root root 4.0K Nov  9 07:14 results
drwxr-xr-x 1 root root 4.0K Nov  5 14:33 sample_data
drwxr-xr-x 2 root root 4.0K Nov  9 05:34 test_dataset
drwxr-xr-x 2 root root 4.0K Nov  9 05:33 train_dataset


In [None]:
!zip -r /content/my_colab_files.zip /content

  adding: content/ (stored 0%)
  adding: content/.config/ (stored 0%)
  adding: content/.config/default_configs.db (deflated 98%)
  adding: content/.config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db (deflated 97%)
  adding: content/.config/logs/ (stored 0%)
  adding: content/.config/logs/2025.11.05/ (stored 0%)
  adding: content/.config/logs/2025.11.05/14.33.45.559498.log (deflated 58%)
  adding: content/.config/logs/2025.11.05/14.33.54.129583.log (deflated 56%)
  adding: content/.config/logs/2025.11.05/14.33.36.385956.log (deflated 58%)
  adding: content/.config/logs/2025.11.05/14.33.53.434728.log (deflated 57%)
  adding: content/.config/logs/2025.11.05/14.33.13.470069.log (deflated 93%)
  adding: content/.config/logs/2025.11.05/14.33.44.287731.log (deflated 86%)
  adding: content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: content/.config/configurations/ (stored 0%)
  adding: content/.config/configurations/config_default (deflated 15%)
  adding: content