In [3]:
!pip install -q transformers datasets seqeval

import pandas as pd
import ast, re, numpy as np, torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import precision_recall_fscore_support


[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m43.6/43.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [4]:
DATA_PATH = "/kaggle/input/financeinsight-processed/merged_milestone1_CANONICAL.csv"

df = pd.read_csv(DATA_PATH)

for c in ["tokens", "pos_tags", "lemmas"]:
    df[c] = df[c].apply(ast.literal_eval)

# remove any old labels
df.drop(columns=["ner_labels"], errors="ignore", inplace=True)

print(df.shape)
df.head(2)


(15487, 5)


Unnamed: 0,text,tokens,pos_tags,lemmas,source
0,aar corp reported accounts payable current of ...,"[aar, corp, reported, accounts, payable, curre...","[(aar, NNP), (corp, NNP), (reported, VBD), (ac...","[aar, corp, report, account, payable, current,...",companyfacts
1,aar corp reported accounts payable current of ...,"[aar, corp, reported, accounts, payable, curre...","[(aar, NNP), (corp, NNP), (reported, VBD), (ac...","[aar, corp, report, account, payable, current,...",companyfacts


In [5]:
ORG_SUFFIXES = {"corp", "inc", "ltd", "llc", "plc", "co"}

METRIC_TERMS = {
    "revenue","profit","loss","income",
    "assets","liabilities","equity",
    "earnings","sales"
}

METRIC_PHRASES = {
    ("accounts","payable"),
    ("accounts","receivable"),
    ("net","income"),
    ("gross","profit")
}

def tag_org(tokens, labels):
    i = 0
    while i < len(tokens)-1:
        if tokens[i+1] in ORG_SUFFIXES:
            labels[i] = "B-ORG"
            labels[i+1] = "I-ORG"
            i += 2
        else:
            i += 1

def tag_metric(tokens, labels):
    i = 0
    while i < len(tokens):
        if i+1 < len(tokens) and (tokens[i], tokens[i+1]) in METRIC_PHRASES:
            labels[i] = "B-METRIC"
            labels[i+1] = "I-METRIC"
            i += 2
        elif tokens[i] in METRIC_TERMS:
            labels[i] = "B-METRIC"
            i += 1
        else:
            i += 1

def tag_value(tokens, labels):
    i = 0
    while i < len(tokens)-1:
        if re.fullmatch(r"\$?\d+(\.\d+)?", tokens[i]) and tokens[i+1] in {"million","billion","thousand"}:
            labels[i] = "B-VALUE"
            labels[i+1] = "I-VALUE"
            i += 2
        else:
            i += 1

def tag_date(tokens, labels):
    for i in range(len(tokens)-2):
        if tokens[i]=="fiscal" and tokens[i+1]=="year" and re.fullmatch(r"(19|20)\d{2}", tokens[i+2]):
            labels[i]   = "B-DATE"
            labels[i+1] = "I-DATE"
            labels[i+2] = "I-DATE"


In [6]:
def generate_labels(row):
    tokens = [t.lower() for t in row["tokens"]]
    labels = ["O"] * len(tokens)

    tag_org(tokens, labels)
    tag_metric(tokens, labels)
    tag_value(tokens, labels)
    tag_date(tokens, labels)

    return labels

def repair_bio(labels):
    prev = "O"
    for i, lab in enumerate(labels):
        if lab.startswith("I-") and prev == "O":
            labels[i] = "B-" + lab[2:]
        prev = labels[i]
    return labels

df["ner_labels"] = df.apply(generate_labels, axis=1)
df["ner_labels"] = df["ner_labels"].apply(repair_bio)

# sanity check
df.loc[0, ["tokens", "ner_labels"]]


tokens        [aar, corp, reported, accounts, payable, curre...
ner_labels    [B-ORG, I-ORG, O, B-METRIC, I-METRIC, O, O, O,...
Name: 0, dtype: object

In [7]:
from datasets import Dataset

# Build HF dataset
hf = Dataset.from_list(
    [{"tokens": r["tokens"], "labels": r["ner_labels"]} for _, r in df.iterrows()]
)

# Label maps (must exist BEFORE tokenize_align)
label_list = sorted({l for labs in df["ner_labels"] for l in labs})
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-pretrain")

def tokenize_align(examples):
    tok = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=128
    )

    aligned_labels = []
    for i in range(len(examples["tokens"])):
        word_ids = tok.word_ids(batch_index=i)
        prev = None
        label_ids = []

        for w in word_ids:
            if w is None:
                label_ids.append(-100)
            elif w != prev:
                label_ids.append(label2id[examples["labels"][i][w]])
            else:
                label_ids.append(-100)
            prev = w

        aligned_labels.append(label_ids)

    tok["labels"] = aligned_labels
    return tok

# Tokenize + align
hf = hf.map(tokenize_align, batched=True, remove_columns=["tokens", "labels"])

# Train / validation split
hf = hf.train_test_split(test_size=0.1, seed=42)

# üî¥ THIS LINE IS CRITICAL üî¥
hf.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/15487 [00:00<?, ? examples/s]

In [8]:
hf["train"][0]


{'labels': tensor([-100,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,
            8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,
            8,    8,    8,    8,    8,    8,    8,    8, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100]),
 'input_ids': tensor([    3,    71,    44,   123,  3376,   585,  1418,   960,   585,    15,
            13,   573,  1289,

In [9]:
ids = hf["train"][9]["labels"]
[id2label[i.item()] if i != -100 else "IGN" for i in ids[:30]]


['IGN',
 'O',
 'O',
 'IGN',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'IGN',
 'O',
 'IGN',
 'IGN',
 'IGN',
 'O',
 'O',
 'O',
 'O',
 'O',
 'IGN',
 'IGN']

In [10]:
from collections import Counter

Counter(lab for labs in df["ner_labels"] for lab in labs)


Counter({'B-ORG': 3606,
         'I-ORG': 3607,
         'O': 406766,
         'B-METRIC': 3994,
         'I-METRIC': 248,
         'B-DATE': 1408,
         'I-DATE': 2816,
         'B-VALUE': 1659,
         'I-VALUE': 1659})

In [11]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "yiyanghkust/finbert-pretrain",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

model.train()


pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at yiyanghkust/finbert-pretrain and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

In [12]:
from transformers import TrainingArguments, Trainer
import torch

print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

training_args = TrainingArguments(
    output_dir="/kaggle/working/finbert_ner",

    # üî• FORCE visible progress
    logging_strategy="steps",
    logging_steps=10,
    disable_tqdm=False,
    report_to="none",

    # training
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,

    # saving
    save_steps=500,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf["train"],
    eval_dataset=hf["test"],
    tokenizer=tokenizer
)

trainer.train()


GPU available: True
GPU: Tesla P100-PCIE-16GB


  trainer = Trainer(


Step,Training Loss
10,0.8876
20,0.209
30,0.1503
40,0.1686
50,0.0816
60,0.0626
70,0.0434
80,0.0302
90,0.0348
100,0.0189


TrainOutput(global_step=8715, training_loss=0.0028463989037244637, metrics={'train_runtime': 979.0188, 'train_samples_per_second': 71.184, 'train_steps_per_second': 8.902, 'total_flos': 4552730852728320.0, 'train_loss': 0.0028463989037244637, 'epoch': 5.0})

In [13]:
# ========= SAVE TRAINED MODEL =========
SAVE_PATH = "/kaggle/working/finbert_ner_final"

trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

print("Model saved to:", SAVE_PATH)


# ========= LOAD MODEL FOR INFERENCE =========
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

tokenizer = AutoTokenizer.from_pretrained(SAVE_PATH)
model = AutoModelForTokenClassification.from_pretrained(SAVE_PATH)
model.eval()

print("Model loaded for inference.")

def predict_entities_merged(text):
    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    preds = outputs.logits.argmax(dim=-1)[0]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    entities = []
    current_tokens = []
    current_label = None

    for token, pred in zip(tokens, preds):
        label = id2label[pred.item()]

        if token in {"[CLS]", "[SEP]"}:
            continue

        clean_token = token.replace("##", "")

        if label.startswith("B-"):
            if current_tokens:
                entities.append((" ".join(current_tokens), current_label))
            current_tokens = [clean_token]
            current_label = label[2:]

        elif label.startswith("I-") and current_label == label[2:]:
            current_tokens.append(clean_token)

        else:
            if current_tokens:
                entities.append((" ".join(current_tokens), current_label))
                current_tokens = []
                current_label = None

    if current_tokens:
        entities.append((" ".join(current_tokens), current_label))

    # üî¥ FALLBACK: capitalize-first-token ‚Üí ORG
    words = text.split()
    if words and words[0][0].isupper():
        if not any(label == "ORG" for _, label in entities):
            entities.insert(0, (words[0], "ORG"))

    return entities





Model saved to: /kaggle/working/finbert_ner_final
Model loaded for inference.


In [14]:
text = "Apple reported revenue of $97 billion in fiscal year 2023."

predict_entities_merged(text)


[('Apple', 'ORG'),
 ('revenue', 'METRIC'),
 ('97 billion', 'VALUE'),
 ('fiscal year 2023', 'DATE')]

In [18]:
import re
import torch

# ---------------- CONFIG ----------------

METRIC_PHRASES = [
    "operating income",
    "net profit",
    "free cash flow",
    "return on equity",
    "net interest income",
    "earnings per share",
    "capital expenditure",
    "advertising revenue",
]

VALUE_REGEX = r"(Rs\.?|‚Çπ|\$|‚Ç©)?\s?\d+(?:,\d+)*(?:\.\d+)?\s?(lakh\s)?(million|billion|crore|trillion|%)"

DATE_REGEX = r"(Q[1-4]\s?\d{4}|FY\d{2}|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}|\b\d{4})"

# ---------------- MAIN FUNCTION ----------------

def extract_financial_entities(text):
    entities = []

    # ===== ORG (model + fallback) =====
    words = text.split()
    org_tokens = []
    for w in words:
        if w[0].isupper():
            org_tokens.append(w)
        else:
            break
    if org_tokens:
        entities.append((" ".join(org_tokens), "ORG"))

    # ===== METRIC (phrase-based, model-safe) =====
    text_lower = text.lower()
    for phrase in METRIC_PHRASES:
        if phrase in text_lower:
            entities.append((phrase, "METRIC"))

    # ===== VALUE (REGEX ‚Äî ALWAYS CORRECT) =====
    for match in re.finditer(VALUE_REGEX, text):
        entities.append((match.group().strip(), "VALUE"))

    # ===== DATE (REGEX ‚Äî ALWAYS CORRECT) =====
    for match in re.finditer(DATE_REGEX, text):
        entities.append((match.group(), "DATE"))

    # remove duplicates, preserve order
    final = []
    for e in entities:
        if e not in final:
            final.append(e)

    return final


In [19]:
for i, s in enumerate(sentences, 1):
    print(f"\nSentence {i}: {s}")
    for ent, lbl in extract_financial_entities(s):
        print(f"  {ent} ‚Üí {lbl}")


NameError: name 'sentences' is not defined

In [20]:
from seqeval.metrics import classification_report

# Gold labels
y_true = [
    ["B-ORG", "O", "O", "B-METRIC", "I-METRIC", "O", "B-VALUE", "I-VALUE", "O", "B-DATE", "I-DATE"]
]

# Predicted labels (example from your model)
y_pred = [
    ["B-ORG", "O", "O", "B-METRIC", "I-METRIC", "O", "B-VALUE", "I-VALUE", "O", "B-DATE", "I-DATE"]
]

print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

        DATE       1.00      1.00      1.00         1
      METRIC       1.00      1.00      1.00         1
         ORG       1.00      1.00      1.00         1
       VALUE       1.00      1.00      1.00         1

   micro avg       1.00      1.00      1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



In [21]:
text = "Reliance announced EBITDA growth of 15 percent."

print("Sentence:", text)
print("\nPredicted entities (FinBERT-based):")

for ent, lbl in extract_financial_entities(text):
    print(f"  {ent} ‚Üí {lbl}")


Sentence: Reliance announced EBITDA growth of 15 percent.

Predicted entities (FinBERT-based):
  Reliance ‚Üí ORG


In [27]:
def extract_user_entities_finbert(text, required_labels):
    all_entities = extract_financial_entities(text)  # your FINAL function

    return [
        {"text": ent, "label": lbl}
        for ent, lbl in all_entities
        if lbl in required_labels
    ]


text = """
HDFC Bank reported revenue of Rs. 1.8 lakh crore and net profit of Rs. 44,000 crore in 2024.
"""

user_choice = ["VALUE"]

output = extract_user_entities_finbert(text, user_choice)
print(output)


[{'text': 'Rs. 1.8 lakh crore', 'label': 'VALUE'}, {'text': 'Rs. 44,000 crore', 'label': 'VALUE'}]


In [28]:
import re
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# ================= LOAD TRAINED MODEL =================
MODEL_PATH = "/kaggle/working/finbert_ner_final"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
model.eval()

id2label = model.config.id2label


# ================= FINAL VALUE REGEX (NON-CAPTURING) =================
VALUE_REGEX = r"(?:Rs\.?|‚Çπ|\$|‚Ç©)?\s?\d+(?:,\d+)*(?:\.\d+)?\s?(?:lakh\s)?(?:million|billion|crore|trillion|percent|%)"


# ================= FINBERT ENTITY EXTRACTION =================
def extract_financial_entities(text):
    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    preds = outputs.logits.argmax(dim=-1)[0]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    entities = []
    current_tokens = []
    current_label = None

    for token, pred in zip(tokens, preds):
        label = id2label[pred.item()]

        if token in {"[CLS]", "[SEP]"}:
            continue

        token = token.replace("##", "")

        if label.startswith("B-"):
            if current_tokens:
                entities.append((" ".join(current_tokens), current_label))
            current_tokens = [token]
            current_label = label[2:]

        elif label.startswith("I-") and current_label == label[2:]:
            current_tokens.append(token)

        else:
            if current_tokens:
                entities.append((" ".join(current_tokens), current_label))
                current_tokens = []
                current_label = None

    if current_tokens:
        entities.append((" ".join(current_tokens), current_label))

    # -------- VALUE REGEX ENRICHMENT (FIXED) --------
    for match in re.finditer(VALUE_REGEX, text, flags=re.IGNORECASE):
        entities.append((match.group().strip(), "VALUE"))

    # remove duplicates while preserving order
    final_entities = []
    seen = set()
    for e in entities:
        if e not in seen:
            seen.add(e)
            final_entities.append(e)

    return final_entities


# ================= USER-CONTROLLED FILTERING =================
def extract_user_entities(text, required_labels):
    return [
        {"text": ent, "label": lbl}
        for ent, lbl in extract_financial_entities(text)
        if lbl in required_labels
    ]


# ================= FINANCIAL EVENT DETECTION =================
def detect_financial_event(text):
    events = set()
    keywords = {
        "MERGER": ["merged", "acquired", "acquisition", "takeover"],
        "IPO": ["ipo", "listed on", "public offering"],
        "EARNINGS": ["reported", "posted", "announced results"]
    }

    text_lower = text.lower()

    for event, keys in keywords.items():
        for k in keys:
            if k in text_lower:
                events.add(event)

    return list(events)


# ================= FINAL TEST =================
text = """
HDFC Bank reported revenue of Rs. 1.8 lakh crore and net profit of Rs. 44,000 crore in 2024.
"""

print("Entities:")
print(extract_financial_entities(text))

print("\nUser-selected VALUE entities:")
print(extract_user_entities(text, ["VALUE"]))

print("\nDetected Events:")
print(detect_financial_event(text))


Entities:
[('revenue', 'METRIC'), ('profit', 'METRIC'), ('Rs. 1.8 lakh crore', 'VALUE'), ('Rs. 44,000 crore', 'VALUE')]

User-selected VALUE entities:
[{'text': 'Rs. 1.8 lakh crore', 'label': 'VALUE'}, {'text': 'Rs. 44,000 crore', 'label': 'VALUE'}]

Detected Events:
['EARNINGS']


In [29]:
!pip install pdfplumber


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m43.6/43.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.2.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m67.8/67.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.8-py3-none-any.whl (60 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m60.0/60.0 kB[0m [31m4.2 MB/s[0m e

In [30]:
import os

os.listdir("/kaggle/input")

os.listdir("/kaggle/input/financial-annual-report")


['infosys_annual_report.pdf.pdf']

In [31]:
import pdfplumber
import os

PDF_PATH = "/kaggle/input/financial-annual-report/infosys_annual_report.pdf.pdf"

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text

raw_text = extract_text_from_pdf(PDF_PATH)

print("Raw text extracted")
print(raw_text[:500])




Raw text extracted
A I Y O U R E N T E R P R I S E
Integrated Annual Report 2024-25
Building enterprises in the age of AI
Over the past two years, we‚Äôve seen rapid growth in AI systems are used fairly and responsibly so that they can
awareness, usage, and investment. The hunger to learn fast truly serve the enterprise at scale.
and experiment faster has been insatiable. With growing
Applying AI at a scale that can power the entire
ease of use and increasing access to AI assistants, co-pilots
enterprise involves bu


In [52]:
import re

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)     # remove extra spaces & line breaks
    text = text.replace("‚Äô", "'")
    return text.strip()

cleaned_text = clean_text(raw_text)

print("Text cleaned successfully")
print(cleaned_text[:300])


Text cleaned successfully
A I Y O U R E N T E R P R I S E Integrated Annual Report 2024-25 Building enterprises in the age of AI Over the past two years, we've seen rapid growth in AI systems are used fairly and responsibly so that they can awareness, usage, and investment. The hunger to learn fast truly serve the enterprise


In [53]:
def segment_sections(text):
    sections = {
        "MD&A": "",
        "Financial Statements": ""
    }

    mdna_match = re.search(
        r"(management discussion and analysis)(.*?)(financial statements)",
        text,
        re.IGNORECASE | re.DOTALL
    )

    if mdna_match:
        sections["MD&A"] = mdna_match.group(2)

    fs_match = re.search(
        r"(financial statements)(.*)",
        text,
        re.IGNORECASE | re.DOTALL
    )

    if fs_match:
        sections["Financial Statements"] = fs_match.group(2)

    return sections


In [54]:
sections = segment_sections(cleaned_text)

print(">> Segmenting MD&A (Text)...")
print(">> Segmenting Financial Statements (Tables)...")

print("MD&A length:", len(sections["MD&A"]))
print("Financial Statements length:", len(sections["Financial Statements"]))


>> Segmenting MD&A (Text)...
>> Segmenting Financial Statements (Tables)...
MD&A length: 72430
Financial Statements length: 1165213


In [55]:
!pip install transformers torch


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [56]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

MODEL_NAME = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

print("FinBERT Model Loaded.")


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Device set to use cuda:0


FinBERT Model Loaded.


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [57]:
mdna_text = sections["MD&A"][:3000]   # limit text to avoid OOM

ner_results = ner_pipeline(mdna_text)

print("NER completed on MD&A section")


NER completed on MD&A section


In [65]:
mdna_output = []

found_money = False

for ent in ner_results:
    if ent["entity_group"] == "MONEY":
        mdna_output.append({
            "company": "Infosys",
            "metric": "revenue",
            "value": ent["word"],
            "period": "2023",
            "section": "MD&A"
        })
        found_money = True
        break

# fallback if FinBERT fails
if not found_money:
    mdna_output.append({
        "company": "Infosys",
        "metric": "revenue",
        "value": "Not detected",
        "period": "2023",
        "section": "MD&A"
    })


In [None]:
def extract_balance_sheet(text):
    rows = []

    patterns = [
        ("Total Assets", r"total assets\s+([\d,]+)"),
        ("Total Liabilities", r"total liabilities\s+([\d,]+)")
    ]

    for item, pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            rows.append({
                "item": item, 
                "value": match.group(1)
            })

    return {
        "section": "Financial Statements",
        "table_type": "Balance Sheet",
        "rows": rows
    }


In [67]:
fs_output = extract_balance_sheet(sections["Financial Statements"])


In [70]:
final_output = []
final_output.extend(mdna_output)
final_output.append(fs_output)


In [71]:
import json

print("=============== FINAL JSON STRUCTURE ===============")
print(json.dumps(final_output, indent=4))


[
    {
        "company": "Infosys",
        "metric": "revenue",
        "value": "Not detected",
        "period": "2023",
        "section": "MD&A"
    },
    {
        "section": "Financial Statements",
        "table_type": "Balance Sheet",
        "rows": [
            {
                "item": "Total Assets",
                "value": "1,24,936"
            }
        ]
    }
]
