In [None]:
import pandas as pd

# Load your preprocessed dataset
df = pd.read_csv("financial_ner_labeled.csv")
# Check the data
print(df.head())
print(len(df))


        token       lemma    pos  label
0      united      united  PROPN      O
1      states      states  PROPN      O
2  securities  securities  PROPN      O
3    exchange    exchange  PROPN  B-ORG
4  commission  commission  PROPN  B-ORG
69932


In [None]:
sentences = []
labels = []

current_tokens = []
current_labels = []

for idx, row in df.iterrows():
    token = row['token']
    label = row['label']

    if pd.isna(token) or token.strip() == "":
        if current_tokens:
            sentences.append(current_tokens)
            labels.append(current_labels)
            current_tokens = []
            current_labels = []
    else:
        current_tokens.append(token)
        current_labels.append(label)
#group tokens and labels into full sentences
if current_tokens:
    sentences.append(current_tokens)
    labels.append(current_labels)

print(len(sentences))
print(sentences[0])
print(labels[0])


851
['united', 'states', 'securities', 'exchange', 'commission', 'washington', 'd.c', '20549', 'form', '40', 'f', 'form', '20', 'f', 'form', '40', 'f', 'check', 'one', 'registration', 'statement', 'pursuant', 'section', '12', 'securities', 'exchange', 'act', '1934']
['O', 'O', 'O', 'B-ORG', 'B-ORG', 'B-LOC', 'B-LOC', 'B-NUM', 'B-FORM', 'B-NUM', 'O', 'B-FORM', 'B-NUM', 'O', 'B-FORM', 'B-NUM', 'O', 'O', 'B-NUM', 'B-FORM', 'B-FORM', 'O', 'O', 'B-NUM', 'O', 'B-ORG', 'O', 'B-NUM']


In [None]:
# Get unique labels
unique_labels = list(set([l for sublist in labels for l in sublist]))
unique_labels.sort()
label2id = {l: i for i, l in enumerate(unique_labels)}
id2label = {i: l for l, i in label2id.items()}

# Map labels to IDs
labels_ids = [[label2id[l] for l in sent_labels] for sent_labels in labels]


In [None]:
from transformers import AutoTokenizer
#tokenize and alignlabels
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(sentences, labels):
    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
        max_length=128
    )

    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                # For wordpieces, assign -100 to ignore
                label_ids.append(-100)
            previous_word_idx = word_idx
        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

tokenized_dataset = tokenize_and_align_labels(sentences, labels_ids)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
from sklearn.model_selection import train_test_split
#Train/Test Split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    sentences, labels_ids, test_size=0.2, random_state=42
)
#tokenization and Label Alignment
train_dataset = tokenize_and_align_labels(train_texts, train_labels)
test_dataset = tokenize_and_align_labels(test_texts, test_labels)


In [None]:
from transformers import AutoModelForTokenClassification
#Load Pretrained Model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments
#Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    do_eval=True
)


In [None]:
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }


In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=771db78e6feba9ab370fdfeca7cd62f3dc8d66d91992bdda928b8850cec489ae
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
from transformers import Trainer
#Trainer initialization:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [None]:
!pip install datasets
from datasets import Dataset




In [None]:
from datasets import Dataset
import pandas as pd


data = Dataset.from_dict({"tokens": sentences, "ner_tags": labels_ids})

# Split into train and test sets
data = data.train_test_split(test_size=0.2, seed=42)
train_dataset = data["train"]
test_dataset = data["test"]


In [None]:
from transformers import AutoTokenizer

model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )

    labels = []
    for i, label in enumerate(example["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = data.map(tokenize_and_align_labels, batched=True)
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]


Map:   0%|          | 0/680 [00:00<?, ? examples/s]

Map:   0%|          | 0/171 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)


In [None]:
from transformers import TrainingArguments
#training arguments setup
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    do_eval=True
)


In [None]:
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score
#takes raw model outputs, aligns them with the true labels
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }


In [None]:
from transformers import Trainer
#trainer intilization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "dryrun"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"


In [None]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=[],  # 👈 disables wandb completely
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
results = trainer.evaluate()
print(results)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.011954,0.989607,0.991898,0.990751,0.997104
2,No log,0.009893,0.989619,0.993056,0.991334,0.997285
3,No log,0.007771,0.991926,0.99537,0.993645,0.998009




{'eval_loss': 0.00777128990739584, 'eval_precision': 0.9919261822376009, 'eval_recall': 0.9953703703703703, 'eval_f1': 0.9936452917388793, 'eval_accuracy': 0.9980090497737557, 'eval_runtime': 70.4349, 'eval_samples_per_second': 2.428, 'eval_steps_per_second': 0.312, 'epoch': 3.0}


In [None]:

model.save_pretrained("./financial_ner_model")
tokenizer.save_pretrained("./financial_ner_model")



('./financial_ner_model/tokenizer_config.json',
 './financial_ner_model/special_tokens_map.json',
 './financial_ner_model/vocab.txt',
 './financial_ner_model/added_tokens.json',
 './financial_ner_model/tokenizer.json')

In [None]:
!ls financial_ner_model


config.json	   special_tokens_map.json  tokenizer.json
model.safetensors  tokenizer_config.json    vocab.txt


In [None]:
!zip -r financial_ner_model.zip financial_ner_model
from google.colab import files
files.download("financial_ner_model.zip")


  adding: financial_ner_model/ (stored 0%)
  adding: financial_ner_model/model.safetensors (deflated 7%)
  adding: financial_ner_model/special_tokens_map.json (deflated 42%)
  adding: financial_ner_model/config.json (deflated 50%)
  adding: financial_ner_model/tokenizer.json (deflated 70%)
  adding: financial_ner_model/tokenizer_config.json (deflated 75%)
  adding: financial_ner_model/vocab.txt (deflated 49%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import os, sys, warnings

MODEL_PATH = "./financial_ner_model"

print("Files in model folder:")
print(os.listdir(MODEL_PATH))

use_safetensors = os.path.exists(os.path.join(MODEL_PATH, "model.safetensors"))
print("Use safetensors?", use_safetensors)

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH, use_safetensors=use_safetensors)
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    print("✅ Model and pipeline loaded successfully.")
except Exception as e:
    print("❌ Model load failed:", e)
    raise


Files in model folder:
['model.safetensors', 'special_tokens_map.json', 'config.json', 'tokenizer.json', 'tokenizer_config.json', 'vocab.txt']
Use safetensors? True


Device set to use cpu


✅ Model and pipeline loaded successfully.


In [None]:
text = "Apple Inc. reported a 15% revenue increase and announced acquisition of Beats for $3 billion."
print("Input:", text)
print("NER output:")
print(ner_pipeline(text))


Input: Apple Inc. reported a 15% revenue increase and announced acquisition of Beats for $3 billion.
NER output:
[{'entity_group': 'NUM', 'score': np.float32(0.98227537), 'word': '15', 'start': 22, 'end': 24}, {'entity_group': 'NUM', 'score': np.float32(0.9875859), 'word': '3', 'start': 83, 'end': 84}]


In [None]:
def extract_user_entities(text, user_entities):
    results = ner_pipeline(text)
    extracted = []
    for entity in results:
        for target in user_entities:
            # match either detected token or a direct substring (for company names)
            if target.lower() in entity["word"].lower() or target.lower() in text.lower():
                extracted.append({
                    "Entity": entity["word"],
                    "Label": entity["entity_group"],
                    "Score": float(round(entity["score"], 4))
                })
    return extracted

def extract_financial_events(text):
    events = []
    keywords = {
        "merger": "Merger/Acquisition",
        "acquisition": "Merger/Acquisition",
        "ipo": "Initial Public Offering",
        "earnings call": "Earnings Call",
        "stock split": "Stock Split",
        "dividend": "Dividend Announcement"
    }
    low = text.lower()
    for key, value in keywords.items():
        if key in low:
            events.append({"Event": value, "Keyword": key})
    return events


In [None]:
import yfinance as yf

def get_financial_data(ticker):
    if not ticker:
        return None
    try:
        stock = yf.Ticker(ticker)
        info = stock.info
        return {
            "Company": info.get("longName") or info.get("shortName"),
            "Current Price": info.get("currentPrice"),
            "Market Cap": info.get("marketCap"),
            "EPS": info.get("trailingEps"),
            "Revenue (TTM)": info.get("totalRevenue")
        }
    except Exception as e:
        return {"error": str(e)}


In [None]:
def full_financial_analysis(text, user_entities, ticker=None):
    print("🔹 Extracting user-defined entities...")
    entities = extract_user_entities(text, user_entities)

    print("🔹 Extracting financial events...")
    events = extract_financial_events(text)

    print("🔹 Fetching company financials...")
    data = get_financial_data(ticker) if ticker else None

    return {
        "Extracted Entities": entities,
        "Detected Events": events,
        "Company Financials": data
    }


In [None]:
import pandas as pd

sample_text = "Apple Inc. reported a 15% revenue increase and announced acquisition of Beats for $3 billion."
user_entities = ["Apple Inc.", "revenue", "EPS", "market cap"]
ticker = "AAPL"

results = full_financial_analysis(sample_text, user_entities, ticker)

print("\n=== Extracted Entities ===")
print(results["Extracted Entities"])
print("\n=== Detected Events ===")
print(results["Detected Events"])
print("\n=== Company Financials ===")
print(results["Company Financials"])

pd.DataFrame(results["Extracted Entities"]).to_csv("user_entities.csv", index=False)
pd.DataFrame(results["Detected Events"]).to_csv("events.csv", index=False)
print("\nSaved user_entities.csv and events.csv to workspace. Use file panel to download.")


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

sent_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
sent_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

sentiment_pipeline = pipeline("sentiment-analysis", model=sent_model, tokenizer=sent_tokenizer)

# Analyze sentiment of your extracted text
sample_text = "Apple Inc. reported a 15% revenue increase and announced acquisition of Beats for $3 billion."
sentiment = sentiment_pipeline(sample_text)

print("Financial Sentiment:", sentiment)




config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


Financial Sentiment: [{'label': 'positive', 'score': 0.9544737339019775}]


In [None]:
text = "Apple Inc. reported a 15% revenue increase and announced the acquisition of Beats for $3 billion."
user_entities = ["Apple Inc.", "Beats", "revenue", "acquisition"]
analysis_results = full_financial_analysis(text, user_entities, ticker="AAPL")
#mnaual test
from pprint import pprint
pprint(analysis_results)


🔹 Extracting user-defined entities...
🔹 Extracting financial events...
🔹 Fetching company financials...
{'Company Financials': {'Company': 'Apple Inc.',
                        'Current Price': 267.44,
                        'EPS': 7.46,
                        'Market Cap': 3968913899520,
                        'Revenue (TTM)': 416161005568},
 'Detected Events': [{'Event': 'Merger/Acquisition', 'Keyword': 'acquisition'}],
 'Extracted Entities': [{'Entity': '15',
                         'Label': 'NUM',
                         'Score': 0.9778000116348267},
                        {'Entity': '15',
                         'Label': 'NUM',
                         'Score': 0.9778000116348267},
                        {'Entity': '15',
                         'Label': 'NUM',
                         'Score': 0.9778000116348267},
                        {'Entity': '15',
                         'Label': 'NUM',
                         'Score': 0.9778000116348267},
                       

In [None]:
#4#

In [None]:
#!/usr/bin/env python3
"""
parse_financial_pdf.py

Usage:
    python parse_financial_pdf.py path/to/report.pdf
"""

import sys
import json
import re
import math
from typing import List, Dict, Any, Optional
import tempfile
import os

# Third-party libs - install if needed:
# pip install pdfplumber camelot-py[cv] pytesseract opencv-python requests pandas
!pip install pytesseract opencv-python-headless
try:
    import pdfplumber
    import camelot
    import pytesseract
    import cv2
    import pandas as pd
    import numpy as np # Add numpy import for cv2.cvtColor
except Exception as e:
    print("Missing dependencies. Install: pdfplumber camelot-py[cv] pytesseract opencv-python pandas")
    raise

# ---------------------------
# ========== CONFIG =========
# ---------------------------
LLM_API_ENDPOINT = "https://your-llm-endpoint.example/v1/parse"  # replace with your LLM call wrapper if any
LLM_API_KEY = os.environ.get("LLM_API_KEY", "")  # if needed
NUMERIC_TOLERANCE_PCT = 0.5  # percent tolerance for asset vs liabilities+equity checks

# ---------------------------
# ======= MOCK/PLACEHOLDERS ==
# ---------------------------
# Replace this with your actual financial NER model instance or client.
class MockFinancialNER:
    def predict(self, text: str) -> List[Dict[str, Any]]:
        # Mock NER output format: list of {label, start, end, text, score}
        # Real model should return entity spans and labels like "ACCOUNT", "AMOUNT", "DATE", etc.
        # For demo: heuristically tag currency-like tokens and uppercase sequences
        entities = []
        # numbers
        for m in re.finditer(r"[\d\(\)\,\.]+", text):
            entities.append({"label": "AMOUNT", "start": m.start(), "end": m.end(), "text": m.group(), "score": 0.9})
        # very naive account detection
        if len(text.strip()) > 0 and any(w.lower() in text.lower() for w in ["cash", "receivable", "asset", "liabil", "revenue", "income", "equity", "profit", "expense"]):
            entities.append({"label": "ACCOUNT", "start": 0, "end": len(text), "text": text.strip(), "score": 0.85})
        return entities

financial_ner_model = MockFinancialNER()

def mock_llm_parse(prompt_payload: Dict[str, Any]) -> Dict[str, Any]:
    """
    Placeholder for LLM parsing: in production replace with a real LLM call.
    This mock tries to convert a CSV-like table payload into JSON with naive rules.
    """
    # The payload we'll send: {"caption":..., "csv":..., "ner_tags":...}
    csv_text = prompt_payload.get("csv", "")
    rows = [r for r in csv_text.splitlines() if r.strip()]
    header = []
    parsed = {"document_id": prompt_payload.get("document_id", "doc_mock"),
              "table_id": prompt_payload.get("table_id", "tbl_mock"),
              "type": "unknown",
              "units": None,
              "rows": []}
    if not rows:
        return parsed

    # assume first row is header if commas > 1
    first = rows[0]
    if "," in first:
        header = [h.strip() for h in first.split(",")]
        data_rows = rows[1:]
    else:
        header = ["Account"] + [f"Period_{i}" for i in range(1, len(rows[0].split()) )]
        data_rows = rows

    # naive mapping: first column is account name, rest are numbers
    for ridx, r in enumerate(data_rows):
        cols = [c.strip() for c in r.split(",")]
        if len(cols) == 0:
            continue
        account = cols[0]
        periods = {}
        for ci, val in enumerate(cols[1:], start=1):
            # create synthetic period labels: header[ci] if available
            label = header[ci] if ci < len(header) else f"period_{ci}"
            # try parse number
            num = parse_number(val)
            if num is not None:
                periods[label] = num
        row_out = {"row_index": ridx, "account_name": account, "normalized_account": account, "periods": periods, "confidence": 0.9, "notes": None}
        parsed["rows"].append(row_out)

    # simple heuristic: if caption contains 'balance' -> balance_sheet
    caption = prompt_payload.get("caption", "") or ""
    if "balance" in caption.lower():
        parsed["type"] = "balance_sheet"
    elif "income" in caption.lower() or "profit" in caption.lower() or "loss" in caption.lower():
        parsed["type"] = "income_statement"
    elif "cash flow" in caption.lower():
        parsed["type"] = "cash_flow"
    else:
        parsed["type"] = "other"
    # units
    if "mill" in caption.lower():
        parsed["units"] = "millions"
    elif "thousand" in caption.lower():
        parsed["units"] = "thousands"
    return parsed

# ---------------------------
# ======= UTIL FUNCTIONS =====
# ---------------------------
def parse_number(s: str, units_factor: float = 1.0) -> Optional[float]:
    """Parse numeric strings like '1,234', '(1,200)', '-', '—' returning float or None."""
    if s is None:
        return None
    t = str(s).strip()
    if t in ["", "-", "—", "na", "n/a", "—"]:
        return None
    # remove surrounding footnote markers, e.g., '1,234a' -> '1,234'
    t = re.sub(r"[a-zA-Z%]+$", "", t).strip()
    neg = False
    if t.startswith("(") and t.endswith(")"):
        neg = True
        t = t[1:-1]
    # sometimes values include currency symbols
    t = t.replace("$", "").replace("€", "").replace("₹", "").replace("£", "")
    t = t.replace(",", "").replace("\u2014", "").strip()
    if t == "":
        return None
    try:
        val = float(t) * units_factor
        return -val if neg else val
    except Exception:
        # fallback - try extracting first float-like sequence
        m = re.search(r"-?\d+(\.\d+)?", t)
        if m:
            try:
                val = float(m.group(0)) * units_factor
                return val
            except:
                return None
        return None

def safe_json_load(s: str) -> Any:
    try:
        return json.loads(s)
    except Exception:
        return None

# ---------------------------
# ======= EXTRACTION =========
# ---------------------------
def extract_tables_pdf(path: str) -> List[Dict[str, Any]]:
    """
    Try Camelot lattice then stream; fallback to pdfplumber extract_tables.
    Returns list of dicts: {"table_id":..., "page":int, "caption": str|null, "df": pd.DataFrame}
    """
    tables_out = []
    try:
        # try lattice (best for ruled tables)
        camelot_tables = camelot.read_pdf(path, pages='all', flavor='lattice')
        if camelot_tables and len(camelot_tables) > 0:
            for i, t in enumerate(camelot_tables):
                df = t.df
                df = df.fillna("").applymap(lambda x: x.strip() if isinstance(x, str) else x)
                tables_out.append({"table_id": f"camelot_lattice_{i}", "page": t.page, "caption": None, "df": df})
    except Exception as e:
        # print("Camelot lattice failed:", e)
        pass

    # if no tables found, try stream
    if not tables_out:
        try:
            camelot_tables = camelot.read_pdf(path, pages='all', flavor='stream')
            if camelot_tables and len(camelot_tables) > 0:
                for i, t in enumerate(camelot_tables):
                    df = t.df
                    df = df.fillna("").applymap(lambda x: x.strip() if isinstance(x, str) else x)
                    tables_out.append({"table_id": f"camelot_stream_{i}", "page": t.page, "caption": None, "df": df})
        except Exception:
            pass

    # final fallback to pdfplumber
    if not tables_out:
        try:
            with pdfplumber.open(path) as pdf:
                table_counter = 0
                for p_idx, page in enumerate(pdf.pages, start=1):
                    try:
                        for tbl in page.extract_tables():
                            df = pd.DataFrame(tbl)
                            df = df.fillna("").applymap(lambda x: x.strip() if isinstance(x, str) else x)
                            tables_out.append({"table_id": f"pdfplumber_{table_counter}", "page": p_idx, "caption": None, "df": df})
                            table_counter += 1
                    except Exception:
                        continue
        except Exception as e:
            print("pdfplumber failed to open:", e)

    return tables_out

# If no tables found (likely scanned), produce an OCR text blob for each page
def ocr_pdf_pages_to_text(path: str) -> List[Dict[str, Any]]:
    """Return list of pages as {'page':n, 'text': '...'} using pytesseract (requires pdf -> images)."""
    pages_text = []
    try:
        import pdf2image
    except Exception:
        raise RuntimeError("Missing pdf2image. Install via `pip install pdf2image` and install poppler.")
    images = pdf2image.convert_from_path(path)
    for i, img in enumerate(images):
        # convert PIL image to OpenCV
        arr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(arr, cv2.COLOR_BGR2GRAY)
        text = pytesseract.image_to_string(gray, config="--psm 1")
        pages_text.append({"page": i+1, "text": text})
    return pages_text

# ---------------------------
# ======= NER TAGGING ========-
# ---------------------------
def ner_tag_table_cells(table: pd.DataFrame, heading: Optional[str] = None) -> Dict[str, Any]:
    """
    For each cell in table DataFrame, run your financial_ner_model.predict(text)
    Returns dict { "cell_r_c": {"text":.., "ner": [...]}, ... }
    """
    tags = {}
    rows, cols = table.shape
    for r in range(rows):
        for c in range(cols):
            txt = str(table.iat[r, c]) if table.iat[r, c] is not None else ""
            # Add a little context (left / above / heading) to help model
            left = str(table.iat[r, c-1]) if c-1 >= 0 else ""
            above = str(table.iat[r-1, c]) if r-1 >= 0 else ""
            context = " | ".join(filter(None, [heading or "", left, above]))
            combined = txt + "\nCONTEXT:\n" + context if context else txt
            ner_out = financial_ner_model.predict(combined)
            tags[f"cell_{r}_{c}"] = {"text": txt, "ner": ner_out}
    return tags

# ---------------------------
# ======= LLM PROMPT BUILD ===
# ---------------------------
def build_prompt_payload(document_id: str, table_id: str, caption: Optional[str], df: pd.DataFrame, ner_tags: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build a compact payload to send to the LLM: small CSV + NER tags + caption
    """
    # create CSV string from df (keep small)
    csv_rows = []
    for r in range(df.shape[0]):
        row = [str(df.iat[r, c]) for c in range(df.shape[1])]
        csv_rows.append(",".join(row))
    # add header if df has header-like first row (heuristic)
    # if first row contains 'Account' or date-like tokens we keep as header
    header = ""
    if df.shape[0] > 0:
        first_row = [str(df.iat[0, c]).lower() for c in range(df.shape[1])]
        if any("account" in x or "balance" in x or re.search(r"\d{4}", x) for x in first_row):
            header = ",".join([str(df.iat[0, c]) for c in range(df.shape[1])])
            csv_text = "\n".join(csv_rows)
        else:
            csv_text = "\n".join(csv_rows)
    else:
        csv_text = ""

    payload = {
        "document_id": document_id,
        "table_id": table_id,
        "caption": caption or "",
        "csv": csv_text,
        "ner_tags": ner_tags
    }
    return payload

# ---------------------------
# ======= LLM CALL ===========
# ---------------------------
def call_llm_parse(payload: Dict[str, Any]) -> Dict[str, Any]:
    """
    Replace this with your real LLM client call:
        - include prompt schema, examples (1-2 shot), and force JSON output.
    Here we call the mock.
    """
    # In production: send HTTP request or use your SDK to call LLM. E.g.:
    # headers = {"Authorization": f"Bearer {LLM_API_KEY}", "Content-Type": "application/json"}
    # resp = requests.post(LLM_API_ENDPOINT, headers=headers, json=payload, timeout=30)
    # return resp.json()
    return mock_llm_parse(payload)

# ---------------------------
# ======= VALIDATION =========
# ---------------------------
def validate_balance_sheet(parsed: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Very small set of checks. Return list of checks with results.
    - assets_match: check if there's a row named 'Total assets' and 'Total liabilities and equity' (or Liabilities + Equity)
    """
    checks = []
    rows = parsed.get("rows", [])
    # helper to find total value for a period label
    def find_total(label_keywords: List[str]) -> Optional[Dict[str, float]]:
        for r in rows:
            name = r.get("account_name", "").lower()
            if any(k in name for k in label_keywords):
                return r.get("periods", {})
        return None

    assets = find_total(["total assets", "totalasset", "assets"])
    liab_eq = find_total(["total liabilities", "liabilities and equity", "liabilities and shareholders' equity", "liabilities and equity", "total liabilities and equity"])
    if assets and liab_eq:
        # compare for periods that intersect
        for per in set(assets.keys()).intersection(set(liab_eq.keys())):
            a = assets.get(per)
            b = liab_eq.get(per)
            if a is None or b is None:
                continue
            diff = abs(a - b)
            pct = (diff / max(abs(a), 1.0)) * 100.0
            checks.append({"check": "assets_vs_liab_eq", "period": per, "assets": a, "liab_eq": b, "diff": diff, "diff_pct": pct, "status": "pass" if pct <= NUMERIC_TOLERANCE_PCT else "fail"})
    else:
        checks.append({"check": "assets_vs_liab_eq", "status": "skipped", "reason": "no_totals_found"})
    return checks

# ---------------------------
# ======= MAIN PARSING FLOW ==
# ---------------------------
def parse_pdf_document(pdf_path: str, document_id: str = "doc_1") -> Dict[str, Any]:
    """
    High level driver: extract tables -> ner on cells -> llm parse -> postprocess -> validate
    """
    results = {"document_id": document_id, "tables": [], "warnings": []}
    tables = extract_tables_pdf(pdf_path)
    if not tables:
        results["warnings"].append("No tables found by Camelot/pdfplumber. Try OCR/scan fallback.")
        # In production you may want to call ocr_pdf_pages_to_text() and then attempt table detection on images.
        return results

    for t in tables:
        table_id = t.get("table_id")
        page = t.get("page")
        df: pd.DataFrame = t.get("df")
        caption = t.get("caption", "") or ""
        # run NER per cell
        ner_tags = ner_tag_table_cells(df, heading=caption)
        # build LLM prompt/payload
        payload = build_prompt_payload(document_id, table_id, caption, df, ner_tags)
        # call LLM to parse into canonical JSON
        parsed = call_llm_parse(payload)
        # post-process numbers: convert keys like "Mar 31 2024" -> ISO style if possible
        for row in parsed.get("rows", []):
            new_periods = {}
            for per_label, val in row.get("periods", {}).items():
                # try to normalize period label - if it's a year or a date parse, else keep as-is
                norm_label = normalize_period_label(per_label)
                new_periods[norm_label] = val
            row["periods"] = new_periods

        # run validation if balance sheet
        validation = []
        if parsed.get("type") == "balance_sheet":
            validation = validate_balance_sheet(parsed)
        # store
        results["tables"].append({"table_id": table_id, "page": page, "caption": caption, "parsed": parsed, "validation": validation})
    return results

def normalize_period_label(label: str) -> str:
    """Try to convert various period labels to ISO date (YYYY-MM-DD) or YYYY if only year found. Fallback: original stripped."""
    label = str(label).strip()
    # Try year:
    m = re.search(r"(20\d{2}|19\d{2})", label)
    if m and len(label) <= 6:
        return m.group(0)
    # Try common formats like 'Mar 31 2024' or '31 Mar 2024'
    try:
        import dateutil.parser as dp
        dt = dp.parse(label, fuzzy=True, default=None)
        if dt:
            return dt.date().isoformat()
    except Exception:
        pass
    return label

# ---------------------------
# ======= CLI ENTRY ==========-
# ---------------------------
def main_cli(argv):
    if len(argv) < 2:
        print("Usage: python parse_financial_pdf.py path/to/report.pdf")
        return 1
    pdf_path = argv[1]
    if not os.path.exists(pdf_path):
        print("File not found:", pdf_path)
        return 1
    print("Parsing PDF:", pdf_path)
    out = parse_pdf_document(pdf_path, document_id=os.path.basename(pdf_path))
    # pretty print
    print("\n=== PARSE RESULTS ===")
    print(json.dumps(out, indent=2, default=str))
    # write to file
    out_path = os.path.splitext(pdf_path)[0] + ".parsed.json"
    with open(out_path, "w") as f:
        json.dump(out, f, indent=2, default=str)
    print("Saved parsed output to:", out_path)
    return 0




In [None]:
# 1) Upload a PDF interactively (or use Drive path)
from google.colab import files
uploaded = files.upload()   # choose the PDF from your computer

# get the uploaded filename
pdf_path = next(iter(uploaded.keys()))
print("Uploaded:", pdf_path)

# 2) Import your pipeline and run it
# ensure parse_financial_pdf.py is in working directory
from parse_financial_pdf import parse_pdf_document   # adjust if function name differs

result = parse_pdf_document(pdf_path, document_id=pdf_path)
# pretty print result summary
import json
print(json.dumps(result, indent=2, default=str)[:2000])   # print first 2000 chars
# save output to file
out_json = pdf_path.rsplit('.',1)[0] + ".parsed.json"
with open(out_json, "w") as f:
    json.dump(result, f, indent=2, default=str)
print("Saved parsed JSON to:", out_json)


Saving 00001140361-09-008205.pdf to 00001140361-09-008205.pdf
Uploaded: 00001140361-09-008205.pdf


ModuleNotFoundError: No module named 'parse_financial_pdf'

In [None]:
# Colab cell: install system and python packages needed by camelot/pdfplumber/pdf2image
!apt-get update -qq
!apt-get install -y -qq poppler-utils ghostscript
!pip install -q camelot-py[cv] pdfplumber pdf2image pytesseract opencv-python-headless pandas python-dateutil


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package fonts-droid-fallback.
(Reading database ... 121713 files and directories currently installed.)
Preparing to unpack .../00-fonts-droid-fallback_1%3a6.0.1r16-1.1build1_all.deb ...
Unpacking fonts-droid-fallback (1:6.0.1r16-1.1build1) ...
Selecting previously unselected package poppler-data.
Preparing to unpack .../01-poppler-data_0.4.11-1_all.deb ...
Unpacking poppler-data (0.4.11-1) ...
Selecting previously unselected package fonts-noto-mono.
Preparing to unpack .../02-fonts-noto-mono_20201225-1build1_all.deb ...
Unpacking fonts-noto-mono (20201225-1build1) ...
Selecting previously unselected package fonts-urw-base35.
Preparing to unpack .../03-fonts-urw-base35_20200910-1_all.deb ...
Unpacking fonts-urw-base35 (20200910-1) ...
Selecting previously unselected pac

In [None]:
%%bash
cat > parse_financial_pdf.py <<'PY'
#!/usr/bin/env python3
"""
Minimal parse_financial_pdf.py
This script extracts tables using Camelot/pdfplumber and writes a JSON summary.
It is a light version intended for quick checks in Colab.
"""
import sys, os, json, re
import pandas as pd

try:
    import camelot
    import pdfplumber
except Exception as e:
    raise RuntimeError("Missing python deps: " + str(e))

def parse_number(s):
    if s is None: return None
    t = str(s).strip()
    if t in ["", "-", "—", "na", "n/a"]:
        return None
    t = re.sub(r"[^\d\-\.\(\)]+$", "", t)  # strip trailing non-numeric chars
    neg = False
    if t.startswith("(") and t.endswith(")"):
        neg = True
        t = t[1:-1]
    t = t.replace(",", "").strip()
    try:
        v = float(t)
        return -v if neg else v
    except:
        return None

def extract_tables_pdf(path):
    tables_out = []
    # try camelot lattice
    try:
        tables = camelot.read_pdf(path, pages='all', flavor='lattice')
        for i,t in enumerate(tables):
            df = t.df.fillna("").applymap(lambda x: x.strip() if isinstance(x, str) else x)
            tables_out.append({"table_id": f"camelot_lattice_{i}", "page": int(t.page), "df": df})
    except Exception:
        pass
    # try stream if none
    if not tables_out:
        try:
            tables = camelot.read_pdf(path, pages='all', flavor='stream')
            for i,t in enumerate(tables):
                df = t.df.fillna("").applymap(lambda x: x.strip() if isinstance(x, str) else x)
                tables_out.append({"table_id": f"camelot_stream_{i}", "page": int(t.page), "df": df})
        except Exception:
            pass
    # fallback to pdfplumber
    if not tables_out:
        try:
            with pdfplumber.open(path) as pdf:
                cnt = 0
                for p_idx, page in enumerate(pdf.pages, start=1):
                    try:
                        for tbl in page.extract_tables():
                            df = pd.DataFrame(tbl).fillna("").applymap(lambda x: x.strip() if isinstance(x, str) else x)
                            tables_out.append({"table_id": f"pdfplumber_{cnt}", "page": p_idx, "df": df})
                            cnt += 1
                    except Exception:
                        continue
        except Exception as e:
            pass
    return tables_out

def table_to_json(table):
    df = table["df"]
    rows = []
    for r in range(df.shape[0]):
        row = []
        for c in range(df.shape[1]):
            cell = df.iat[r,c]
            row.append(cell if cell is not None else "")
        rows.append(row)
    return {"table_id": table["table_id"], "page": table["page"], "n_rows": len(rows), "n_cols": df.shape[1], "sample_rows": rows[:5]}

def main(pdf_path):
    if not os.path.exists(pdf_path):
        print("File not found:", pdf_path); return 1
    tables = extract_tables_pdf(pdf_path)
    out = {"document": os.path.basename(pdf_path), "num_tables": len(tables), "tables": []}
    for t in tables:
        out["tables"].append(table_to_json(t))
    # write JSON beside pdf
    out_path = os.path.splitext(pdf_path)[0] + ".parsed.min.json"
    with open(out_path, "w") as f:
        json.dump(out, f, indent=2)
    print("Parsed", out_path)
    return 0

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python parse_financial_pdf.py <pdf_path>")
        sys.exit(1)
    sys.exit(main(sys.argv[1]))
PY


In [None]:
# Run the parser script (CLI)
!python3 parse_financial_pdf.py "p946.pdf"


  df = t.df.fillna("").applymap(lambda x: x.strip() if isinstance(x, str) else x)
Parsed p946.parsed.min.json


In [None]:
!pip install -q pdf2image
from pdf2image import convert_from_path
import pytesseract, cv2, numpy as np
pages = convert_from_path("00001140361-09-008205.pdf", dpi=200)
for i, pil_img in enumerate(pages[:3]):
    img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray, config="--psm 1")
    print(f"\n--- Page {i+1} OCR sample ---\n")
    print(text[:1200])



--- Page 1 OCR sample ---

 

UNITED STATES SECURITIES AND EXCHANGE COMMISSION
WASHINGTON, D.C. 20549

FORM 40-F and FORM 20-F

Form 40-F
(Check One)

o REGISTRATION STATEMENT PURSUANT TO SECTION 12 OF THE SECURITIES EXCHANGE ACT OF 1934

OR

 

 

ql

 

ANNUAL REPORT PURSUANT TO SECTION 13(a) OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended December 31, 2008

 

Commission File Number: 1-31349

Form 20-F
(Mark One)

o REGISTRATION STATEMENT PURSUANT TO SECTION 12(b) or (g) OF THE SECURITIES EXCHANGE ACT OF 1934

OR

 

 

 

iv] ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended December 31, 2008

 

OR
o TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
OR
o SHELL COMPANY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

Commission File Number: 333-08354

THOMSON REUTERS

 

Form 40-F Form 20-F
(Exact name of Registrant as specifie

In [None]:
#evlauting####

In [None]:
# Quick smoke tests
def load_model():
    # Example: Hugging Face pipeline. Replace model name/path with yours.
    from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
    model_name = "./financial_ner_model"  # Corrected path
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    return ner_pipe

ner = load_model()

samples = [
    "Apple reported revenue of $89.5 billion in Q1 2024.",
    "Google announced acquisition of Fitbit for $2.1 billion.",
    "Net income was $(1,234) million in FY 2023."
]

for s in samples:
    print("TEXT:", s)
    preds = ner(s)   # if your model API differs, adapt this line
    print("PRED:", preds)
    print()


Device set to use cpu


TEXT: Apple reported revenue of $89.5 billion in Q1 2024.
PRED: [{'entity_group': 'NUM', 'score': np.float32(0.9994295), 'word': '89', 'start': 27, 'end': 29}, {'entity_group': 'NUM', 'score': np.float32(0.5132677), 'word': '.', 'start': 29, 'end': 30}, {'entity_group': 'NUM', 'score': np.float32(0.9929656), 'word': '5', 'start': 30, 'end': 31}, {'entity_group': 'NUM', 'score': np.float32(0.99277335), 'word': '##1', 'start': 44, 'end': 45}, {'entity_group': 'NUM', 'score': np.float32(0.99259067), 'word': '202', 'start': 46, 'end': 49}, {'entity_group': 'NUM', 'score': np.float32(0.9979188), 'word': '##4', 'start': 49, 'end': 50}]

TEXT: Google announced acquisition of Fitbit for $2.1 billion.
PRED: [{'entity_group': 'NUM', 'score': np.float32(0.9313989), 'word': '2', 'start': 44, 'end': 45}, {'entity_group': 'NUM', 'score': np.float32(0.94391304), 'word': '1', 'start': 46, 'end': 47}]

TEXT: Net income was $(1,234) million in FY 2023.
PRED: [{'entity_group': 'NUM', 'score': np.float32(

In [None]:
{"text": "Apple reported revenue of $89.5 billion in Q1 2024.", "entities":[{"start":0,"end":5,"label":"ORG"},{"start":23,"end":36,"label":"AMOUNT"},{"start":40,"end":48,"label":"DATE"}]}


{'text': 'Apple reported revenue of $89.5 billion in Q1 2024.',
 'entities': [{'start': 0, 'end': 5, 'label': 'ORG'},
  {'start': 23, 'end': 36, 'label': 'AMOUNT'},
  {'start': 40, 'end': 48, 'label': 'DATE'}]}

In [None]:
# Eval script (seqeval)
!pip install -q seqeval python-dateutil
!python -m spacy download en_core_web_sm
import json, spacy
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score

nlp = spacy.load("en_core_web_sm")

def spans_to_bio(text, spans):
    doc = nlp(text)
    tokens = [t.text for t in doc]
    labels = ["O"] * len(tokens)
    for s in spans:
        s_start, s_end, s_label = s["start"], s["end"], s["label"]
        # align to tokens
        start_idx = None
        end_idx = None
        for i,t in enumerate(doc):
            if t.idx <= s_start < t.idx+len(t.text): start_idx = i
            if t.idx < s_end <= t.idx+len(t.text): end_idx = i
        if start_idx is None:
            for i,t in enumerate(doc):
                if t.idx >= s_start:
                    start_idx = i; break
        if end_idx is None:
            for i,t in enumerate(list(doc)[::-1]): # Fix: convert to list before reversing
                real_i = len(doc)-1-i
                if t.idx + len(t.text) <= s_end:
                    end_idx = real_i; break
        if start_idx is not None and end_idx is not None and end_idx >= start_idx:
            labels[start_idx] = "B-"+s_label
            for j in range(start_idx+1, end_idx+1): labels[j] = "I-"+s_label
    return tokens, labels

# Create a dummy test_annotations.jsonl file for demonstration if it doesn't exist
annotations_file = "test_annotations.jsonl"
if not os.path.exists(annotations_file):
    print(f"Creating dummy {annotations_file} for evaluation.")
    dummy_data = [
        {"text": "Apple Inc. reported revenue of $89.5 billion in Q1 2024.", "entities": [{"start": 0, "end": 10, "label": "ORG"}, {"start": 27, "end": 38, "label": "NUM"}, {"start": 42, "end": 49, "label": "NUM"}]},
        {"text": "Google announced acquisition of Fitbit for $2.1 billion.", "entities": [{"start": 0, "end": 6, "label": "ORG"}, {"start": 41, "end": 52, "label": "NUM"}]},
        {"text": "Net income was $(1,234) million in FY 2023.", "entities": [{"start": 16, "end": 30, "label": "NUM"}, {"start": 33, "end": 40, "label": "NUM"}]}
    ]
    with open(annotations_file, "w") as f:
        for entry in dummy_data:
            f.write(json.dumps(entry) + "\n")

# load data
data = [json.loads(l) for l in open(annotations_file)]
y_true = []; y_pred = []
# load your predictor that returns spans like {'start','end','label'}
# adapt below to your model API
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# The model was saved to './financial_ner_model'
model_name = "./financial_ner_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

for ex in data:
    text = ex["text"]
    true_spans = ex.get("entities", [])
    _, true_bio = spans_to_bio(text, true_spans)
    # predict
    preds = ner_pipe(text)   # [{'entity_group','word','start','end','score'},...]
    pred_spans = []
    for p in preds:
        # Ensure we only include valid spans with a label
        if 'entity_group' in p and p['entity_group'] is not None:
            pred_spans.append({"start":p["start"], "end":p["end"], "label":p["entity_group"]})
    _, pred_bio = spans_to_bio(text, pred_spans)
    y_true.append(true_bio)
    y_pred.append(pred_bio)

print("Precision:", precision_score(y_true, y_pred))
print("Recall:", recall_score(y_true, y_pred))
print("F1:", f1_score(y_true, y_pred))
print("\nDetailed:\n", classification_report(y_true, y_pred))


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Device set to use cpu


Precision: 0.0
Recall: 0.0
F1: 0.0

Detailed:
               precision    recall  f1-score   support

      AMOUNT       0.00      0.00      0.00         3
        DATE       0.00      0.00      0.00         2
         NUM       0.00      0.00      0.00         0
         ORG       0.00      0.00      0.00         2

   micro avg       0.00      0.00      0.00         7
   macro avg       0.00      0.00      0.00         7
weighted avg       0.00      0.00      0.00         7



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() or ""
            text += "\n"
    return text

pdf_path = "/content/00001140361-09-008205.pdf"
pdf_text = extract_text_from_pdf(pdf_path)
print(pdf_text[:2000])  # show first 2000 chars


UNITED STATES SECURITIES AND EXCHANGE COMMISSION
WASHINGTON, D.C. 20549
FORM 40-F and FORM 20-F
Form 40-F
(Check One)
o REGISTRATION STATEMENT PURSUANT TO SECTION 12 OF THE SECURITIES EXCHANGE ACT OF 1934
OR
☑ ANNUAL REPORT PURSUANT TO SECTION 13(a) OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended December 31, 2008
Commission File Number: 1-31349
Form 20-F
(Mark One)
o REGISTRATION STATEMENT PURSUANT TO SECTION 12(b) or (g) OF THE SECURITIES EXCHANGE ACT OF 1934
OR
☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended December 31, 2008
OR
o TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
OR
o SHELL COMPANY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
Commission File Number: 333-08354
Form 40-F Form 20-F
(Exact name of Registrant as specified in its charter) THOMSON
THOMSON REUTERS REUTERS
CORPORATION PLC
Ontario, Canada (Province 

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_name = "/content/financial_ner_model"  # <-- Correct path

financial_ner_model = pipeline(
    "ner",
    model=AutoModelForTokenClassification.from_pretrained(model_name),
    tokenizer=AutoTokenizer.from_pretrained(model_name),
    aggregation_strategy="simple"
)

print("Model loaded successfully!")


Device set to use cpu


Model loaded successfully!


In [None]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                full_text += page_text + "\n"
    return full_text

pdf_path = "/content/00001140361-09-008205.pdf"
pdf_text = extract_text_from_pdf(pdf_path)

print(pdf_text[:1000])


UNITED STATES SECURITIES AND EXCHANGE COMMISSION
WASHINGTON, D.C. 20549
FORM 40-F and FORM 20-F
Form 40-F
(Check One)
o REGISTRATION STATEMENT PURSUANT TO SECTION 12 OF THE SECURITIES EXCHANGE ACT OF 1934
OR
☑ ANNUAL REPORT PURSUANT TO SECTION 13(a) OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended December 31, 2008
Commission File Number: 1-31349
Form 20-F
(Mark One)
o REGISTRATION STATEMENT PURSUANT TO SECTION 12(b) or (g) OF THE SECURITIES EXCHANGE ACT OF 1934
OR
☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended December 31, 2008
OR
o TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
OR
o SHELL COMPANY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
Commission File Number: 333-08354
Form 40-F Form 20-F
(Exact name of Registrant as specified in its charter) THOMSON
THOMSON REUTERS REUTERS
CORPORATION PLC
Ontario, Canada (Province 

In [None]:
ner_predictions = financial_ner_model(pdf_text)
ner_predictions[:20]


[{'entity_group': 'ORG',
  'score': np.float32(0.27714255),
  'word': 'C',
  'start': 63,
  'end': 64},
 {'entity_group': 'NUM',
  'score': np.float32(0.9964275),
  'word': '205',
  'start': 66,
  'end': 69},
 {'entity_group': 'NUM',
  'score': np.float32(0.99617344),
  'word': '##4',
  'start': 69,
  'end': 70},
 {'entity_group': 'NUM',
  'score': np.float32(0.99910456),
  'word': '##9',
  'start': 70,
  'end': 71},
 {'entity_group': 'NUM',
  'score': np.float32(0.99897265),
  'word': '40',
  'start': 77,
  'end': 79},
 {'entity_group': 'NUM',
  'score': np.float32(0.998632),
  'word': '20',
  'start': 91,
  'end': 93},
 {'entity_group': 'FORM',
  'score': np.float32(0.491563),
  'word': 'Form',
  'start': 96,
  'end': 100},
 {'entity_group': 'NUM',
  'score': np.float32(0.99924004),
  'word': '40',
  'start': 101,
  'end': 103},
 {'entity_group': 'NUM',
  'score': np.float32(0.99048376),
  'word': 'One',
  'start': 113,
  'end': 116},
 {'entity_group': 'NUM',
  'score': np.float32(0.

In [None]:
def highlight_entities(text, ents):
    for ent in ents:
        t = ent["word"]
        label = ent["entity_group"]
        text = text.replace(t, f"[{t} | {label}]")
    return text

highlighted = highlight_entities(pdf_text[:2000], ner_predictions)
print(highlighted)


UNITED STATES SE[C | ORG]URITIES AND EX[C | ORG]HANGE [C | ORG]OMMISSION
WASHINGTON[, | NUM] D.[C | ORG]. [[[[20 | NUM] | NUM] | NUM]5 | NUM]49
[F | NUM]ORM [[[40 | NUM] | NUM] | NUM]-[F | NUM] and [F | NUM]ORM [[[20 | NUM] | NUM] | NUM]-[F | NUM]
[[[F | NUM]orm | [F | NUM]ORM] | [F | NUM]ORM] [[[40 | NUM] | NUM] | NUM]-[F | NUM]
([C | ORG]heck [[One | NUM] | NUM])
o REGISTRATION STATEMENT PURSUANT TO SE[C | ORG]TION [[1 | NUM]2 | NUM] O[F | NUM] THE SE[C | ORG]URITIES EX[C | ORG]HANGE A[C | ORG]T O[F | NUM] [[[1 | NUM]9[3 | NUM]4 | NUM] | NUM]
OR
☑ ANNUAL REPORT PURSUANT TO SE[C | ORG]TION [[1 | NUM][3 | NUM] | NUM](a) OR [[1 | NUM]5 | NUM](d) O[F | NUM] THE SE[C | ORG]URITIES EX[C | ORG]HANGE A[C | ORG]T O[F | NUM] [[[1 | NUM]9[3 | NUM]4 | NUM] | NUM]
[F | NUM]or the fiscal year ended [December | NUM] [[3 | NUM][1 | NUM] | NUM][, | NUM] [[[20 | NUM] | NUM] | NUM][08 | NUM]
[C | ORG]ommission [F | NUM]ile Number: [1 | NUM]-[3 | NUM][[1 | NUM][3 | NUM] | NUM]49
[[[F | NUM]orm | [F | NU

In [None]:
# Single cell — paste & run in Colab
!pip install -q gradio pdfplumber transformers

import gradio as gr
import pdfplumber, os, json, re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import numpy as np

# ----- CONFIG: adjust if needed -----
MODEL_DIR = "/content/financial_ner_model"   # <-- ensure this path exists and contains your model files
# ------------------------------------

# ---- load model (once) ----
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR)
# use aggregation_strategy="none" to see token-level then we will merge contiguous tokens;
# or "simple" to get aggregated entities from HF pipeline.
ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
print("NER model loaded from:", MODEL_DIR)

# ---- helpers ----
def extract_text_from_pdf(pdf_path: str) -> str:
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for p in pdf.pages:
            t = p.extract_text()
            if t:
                text += t + "\n"
    return text

def tidy_scores(preds):
    # convert numpy floats to Python floats
    for p in preds:
        if "score" in p and (isinstance(p["score"], (np.floating, np.float32))):
            p["score"] = float(p["score"])
    return preds

# Optional: merge tokens if you used aggregation_strategy="none"
def clean_word(w):
    w = re.sub(r"^##", "", w)     # remove WordPiece ##
    w = w.replace("Ġ", "")        # roberta prefix
    return w

def gradio_parse(file):
    # file is a tempfile-like object from Gradio
    if file is None:
        return {"error": "No file uploaded"}
    try:
        txt = extract_text_from_pdf(file.name)
        if not txt.strip():
            return {"warning": "No selectable text found — the PDF may be scanned. Use OCR fallback."}
        preds = ner_pipe(txt)
        preds = tidy_scores(preds)
        return {"text_preview": txt[:3000], "entities": preds}
    except Exception as e:
        return {"error": str(e)}

# ---- Gradio UI ----
title = "FinanceInsight — NER-only Demo"
desc = "Upload a PDF (text-based). The app extracts text and returns detected entities (ORG, AMT, DATE, etc.)."

iface = gr.Interface(
    fn=gradio_parse,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.JSON(label="NER Output"),
    title=title,
    description=desc,
)

# Launch. Use share=True for a temporary public URL you can give to your sir.
iface.launch(share=True)


Device set to use cpu


NER model loaded from: /content/financial_ner_model
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e13b708e5a8a3cb5fd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')   # follow auth link

# save into Drive
model.save_pretrained("/content/drive/MyDrive/financeinsight/financial_ner_model")
tokenizer.save_pretrained("/content/drive/MyDrive/financeinsight/financial_ner_model")

# (Optional) zip in Drive
import shutil
shutil.make_archive("/content/drive/MyDrive/financeinsight/financial_ner_model", 'zip',
                    "/content/drive/MyDrive/financeinsight/financial_ner_model")


Mounted at /content/drive


'/content/drive/MyDrive/financeinsight/financial_ner_model.zip'

In [None]:
mkdir -p /content/FinanceInsightProject


In [None]:
!cp -r /content/financial_ner_model /content/FinanceInsightProject/


In [None]:
!cp /content/*.py /content/FinanceInsightProject/


In [None]:
!zip -r FinanceInsightProject.zip FinanceInsightProject


  adding: FinanceInsightProject/ (stored 0%)
  adding: FinanceInsightProject/parse_financial_pdf.py (deflated 63%)
  adding: FinanceInsightProject/financial_ner_model/ (stored 0%)
  adding: FinanceInsightProject/financial_ner_model/model.safetensors (deflated 7%)
  adding: FinanceInsightProject/financial_ner_model/special_tokens_map.json (deflated 42%)
  adding: FinanceInsightProject/financial_ner_model/config.json (deflated 50%)
  adding: FinanceInsightProject/financial_ner_model/tokenizer.json (deflated 70%)
  adding: FinanceInsightProject/financial_ner_model/tokenizer_config.json (deflated 75%)
  adding: FinanceInsightProject/financial_ner_model/vocab.txt (deflated 49%)


In [None]:
from google.colab import files
files.download("FinanceInsightProject.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>