In [None]:
import pandas as pd

# Load your preprocessed dataset
df = pd.read_csv("financial_ner_labeled.csv")
# Check the data
print(df.head())
print(len(df))


        token       lemma    pos  label
0      united      united  PROPN      O
1      states      states  PROPN      O
2  securities  securities  PROPN      O
3    exchange    exchange  PROPN  B-ORG
4  commission  commission  PROPN  B-ORG
69932


In [None]:
sentences = []
labels = []

current_tokens = []
current_labels = []

for idx, row in df.iterrows():
    token = row['token']
    label = row['label']

    if pd.isna(token) or token.strip() == "":
        if current_tokens:
            sentences.append(current_tokens)
            labels.append(current_labels)
            current_tokens = []
            current_labels = []
    else:
        current_tokens.append(token)
        current_labels.append(label)
#group tokens and labels into full sentences
if current_tokens:
    sentences.append(current_tokens)
    labels.append(current_labels)

print(len(sentences))
print(sentences[0])
print(labels[0])


851
['united', 'states', 'securities', 'exchange', 'commission', 'washington', 'd.c', '20549', 'form', '40', 'f', 'form', '20', 'f', 'form', '40', 'f', 'check', 'one', 'registration', 'statement', 'pursuant', 'section', '12', 'securities', 'exchange', 'act', '1934']
['O', 'O', 'O', 'B-ORG', 'B-ORG', 'B-LOC', 'B-LOC', 'B-NUM', 'B-FORM', 'B-NUM', 'O', 'B-FORM', 'B-NUM', 'O', 'B-FORM', 'B-NUM', 'O', 'O', 'B-NUM', 'B-FORM', 'B-FORM', 'O', 'O', 'B-NUM', 'O', 'B-ORG', 'O', 'B-NUM']


In [None]:
# Get unique labels
unique_labels = list(set([l for sublist in labels for l in sublist]))
unique_labels.sort()
label2id = {l: i for i, l in enumerate(unique_labels)}
id2label = {i: l for l, i in label2id.items()}

# Map labels to IDs
labels_ids = [[label2id[l] for l in sent_labels] for sent_labels in labels]


In [None]:
from transformers import AutoTokenizer
#tokenize and alignlabels
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(sentences, labels):
    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
        max_length=128
    )

    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                # For wordpieces, assign -100 to ignore
                label_ids.append(-100)
            previous_word_idx = word_idx
        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

tokenized_dataset = tokenize_and_align_labels(sentences, labels_ids)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
from sklearn.model_selection import train_test_split
#Train/Test Split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    sentences, labels_ids, test_size=0.2, random_state=42
)
#tokenization and Label Alignment
train_dataset = tokenize_and_align_labels(train_texts, train_labels)
test_dataset = tokenize_and_align_labels(test_texts, test_labels)


In [None]:
from transformers import AutoModelForTokenClassification
#Load Pretrained Model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments
#Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    do_eval=True
)


In [None]:
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }


In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=e2c181890ae76c22b1f5abede3d393e347376c325eeb2e411340be480f56342d
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
from transformers import Trainer
#Trainer initialization:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [None]:
!pip install datasets
from datasets import Dataset




In [None]:
from datasets import Dataset
import pandas as pd


data = Dataset.from_dict({"tokens": sentences, "ner_tags": labels_ids})

# Split into train and test sets
data = data.train_test_split(test_size=0.2, seed=42)
train_dataset = data["train"]
test_dataset = data["test"]


In [None]:
from transformers import AutoTokenizer

model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )

    labels = []
    for i, label in enumerate(example["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = data.map(tokenize_and_align_labels, batched=True)
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]


Map:   0%|          | 0/680 [00:00<?, ? examples/s]

Map:   0%|          | 0/171 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)


In [None]:
from transformers import TrainingArguments
#training arguments setup
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    do_eval=True
)


In [None]:
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score
#takes raw model outputs, aligns them with the true labels
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }


In [None]:
from transformers import Trainer
#trainer intilization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "dryrun"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"


In [None]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=[],  # 👈 disables wandb completely
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
results = trainer.evaluate()
print(results)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.011745,0.990762,0.993056,0.991908,0.997466
2,No log,0.010385,0.991908,0.993056,0.992481,0.997647
3,No log,0.008377,0.991926,0.99537,0.993645,0.998009




{'eval_loss': 0.008377345278859138, 'eval_precision': 0.9919261822376009, 'eval_recall': 0.9953703703703703, 'eval_f1': 0.9936452917388793, 'eval_accuracy': 0.9980090497737557, 'eval_runtime': 91.003, 'eval_samples_per_second': 1.879, 'eval_steps_per_second': 0.242, 'epoch': 3.0}


In [None]:
model.save_pretrained("./financial_ner_model")
tokenizer.save_pretrained("./financial_ner_model")


('./financial_ner_model/tokenizer_config.json',
 './financial_ner_model/special_tokens_map.json',
 './financial_ner_model/vocab.txt',
 './financial_ner_model/added_tokens.json',
 './financial_ner_model/tokenizer.json')

In [None]:
!ls financial_ner_model


config.json	   special_tokens_map.json  tokenizer.json
model.safetensors  tokenizer_config.json    vocab.txt


In [None]:
!zip -r financial_ner_model.zip financial_ner_model
from google.colab import files
files.download("financial_ner_model.zip")


  adding: financial_ner_model/ (stored 0%)
  adding: financial_ner_model/config.json (deflated 50%)
  adding: financial_ner_model/model.safetensors (deflated 7%)
  adding: financial_ner_model/special_tokens_map.json (deflated 42%)
  adding: financial_ner_model/vocab.txt (deflated 49%)
  adding: financial_ner_model/tokenizer.json (deflated 70%)
  adding: financial_ner_model/tokenizer_config.json (deflated 75%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_path = "financial_ner_model"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
#exrtracted financial entities from texttt
text = "Apple Inc. reported a 15% revenue increase and announced acquisition of Beats for $3 billion."
results = ner_pipeline(text)
for r in results:
    print(r)


Device set to use cpu


{'entity_group': 'NUM', 'score': np.float32(0.9518276), 'word': '15', 'start': 22, 'end': 24}
{'entity_group': 'NUM', 'score': np.float32(0.9921509), 'word': '3', 'start': 83, 'end': 84}


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_path = "./financial_ner_model"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

print("✅ Model loaded successfully!")


Device set to use cpu


✅ Model loaded successfully!


In [None]:
def extract_user_entities(text, user_entities):
    """
    Extract entities that match user-specified financial terms.
    """
    results = ner_pipeline(text)
    extracted = []
    for entity in results:
        for target in user_entities:
            if target.lower() in entity["word"].lower() or target.lower() in text.lower():
                extracted.append({
                    "Entity": entity["word"],
                    "Label": entity["entity_group"],
                    "Score": round(entity["score"], 3)
                })
    return extracted


In [None]:
def extract_financial_events(text):
    """
    Detect events like M&A, IPO, earnings call, stock split, etc.
    """
    events = []
    keywords = {
        "merger": "Merger/Acquisition",
        "acquisition": "Merger/Acquisition",
        "ipo": "Initial Public Offering",
        "earnings call": "Earnings Call",
        "stock split": "Stock Split",
        "dividend": "Dividend Announcement"
    }

    for key, value in keywords.items():
        if key in text.lower():
            events.append({"Event": value, "Keyword": key})

    return events


In [None]:
!pip install yfinance -q
import yfinance as yf

def get_financial_data(ticker):
    stock = yf.Ticker(ticker)
    info = stock.info
    data = {
        "Company": info.get("longName"),
        "Current Price": info.get("currentPrice"),
        "Market Cap": info.get("marketCap"),
        "EPS": info.get("trailingEps"),
        "Revenue (TTM)": info.get("totalRevenue")
    }
    return data


In [None]:
def full_financial_analysis(text, user_entities, ticker=None):
    print("🔹 Extracting user-defined entities...")
    entities = extract_user_entities(text, user_entities)

    print("🔹 Extracting financial events...")
    events = extract_financial_events(text)

    print("🔹 Fetching company financials...")
    data = get_financial_data(ticker) if ticker else None

    return {
        "Extracted Entities": entities,
        "Detected Events": events,
        "Company Financials": data
    }


In [None]:
import pandas as pd

# Example usage:
sample_text = "Apple Inc. announced a new product line. Alphabet Corp. also reported strong earnings."
user_entities = ["Apple Inc.", "Alphabet Corp."]
analysis_results = full_financial_analysis(sample_text, user_entities, ticker="AAPL")

# Now you can access the dictionary keys from analysis_results
pd.DataFrame(analysis_results["Extracted Entities"]).to_csv("user_entities.csv", index=False)
pd.DataFrame(analysis_results["Detected Events"]).to_csv("events.csv", index=False)

print("Analysis results saved to user_entities.csv and events.csv")

🔹 Extracting user-defined entities...
🔹 Extracting financial events...
🔹 Fetching company financials...
Analysis results saved to user_entities.csv and events.csv


In [None]:
#4

In [None]:
from google.colab import files
uploaded = files.upload()


Saving RIL-Integrated-Annual-Report-2024-25.pdf to RIL-Integrated-Annual-Report-2024-25.pdf


In [None]:
!pip install pdfplumber
import pdfplumber

pdf_path = list(uploaded.keys())[0]

text = ""
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text += page.extract_text() or ""

# Save as text
with open("annual_report.txt", "w", encoding="utf-8") as f:
    f.write(text)

print("✅ Text extracted successfully! Sample:")
print(text[:1000])

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.0.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import re

def segment_financial_report(text):
    sections = {
        "Management Discussion and Analysis": re.findall(r"(Management.?s Discussion and Analysis.*?)(?=\n[A-Z])", text, re.DOTALL | re.IGNORECASE),
        "Risk Factors": re.findall(r"(Risk Factors.*?)(?=\n[A-Z])", text, re.DOTALL | re.IGNORECASE),
        "Financial Statements": re.findall(r"(Financial Statements.*?)(?=\n[A-Z])", text, re.DOTALL | re.IGNORECASE),
        "Cash Flow": re.findall(r"(Cash Flow.*?)(?=\n[A-Z])", text, re.DOTALL | re.IGNORECASE),
    }
    return {k: v[0] if v else "" for k, v in sections.items()}

with open("annual_report.txt", "r", encoding="utf-8") as f:
    report_text = f.read()

sections = segment_financial_report(report_text)

for name, content in sections.items():
    print(f"\n📄 Section: {name}")
    print(content[:800], "\n---")



📄 Section: Management Discussion and Analysis
 
---

📄 Section: Risk Factors
 
---

📄 Section: Financial Statements
Financial Statements 
---

📄 Section: Cash Flow
cash flows, enabling it to fund strategic 
---


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_path = "./financial_ner_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example: Run NER on Management Discussion
mdna_text = sections["Management Discussion and Analysis"]
entities = ner_pipeline(mdna_text[:1000])

print("📊 Extracted Financial Entities:")
for e in entities:
    print(e)


Device set to use cpu


📊 Extracted Financial Entities:


In [None]:
import pandas as pd

entity_df = pd.DataFrame(entities)
entity_df.to_csv("extracted_entities.csv", index=False)

print("✅ Entities saved to extracted_entities.csv")
display(entity_df.head())


✅ Entities saved to extracted_entities.csv


In [None]:
#----testing____

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_path = "./financial_ner_model"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

print("✅ Model loaded successfully")


Device set to use cpu


✅ Model loaded successfully


In [None]:
sample_text = """
In 2024, Reliance Industries reported a revenue growth of 12%,
with total earnings per share (EPS) rising to ₹52.4. The company’s market
capitalization reached $230 billion following its acquisition of Future Group.
"""
#Test on a Financial Text Snippet
results = ner_pipeline(sample_text)
for r in results:
    print(r)


{'entity_group': 'NUM', 'score': np.float32(0.925131), 'word': '202', 'start': 4, 'end': 7}
{'entity_group': 'NUM', 'score': np.float32(0.8916565), 'word': '##4', 'start': 7, 'end': 8}
{'entity_group': 'NUM', 'score': np.float32(0.9821921), 'word': '12', 'start': 59, 'end': 61}
{'entity_group': 'NUM', 'score': np.float32(0.99207336), 'word': '##5', 'start': 111, 'end': 112}
{'entity_group': 'NUM', 'score': np.float32(0.99397624), 'word': '##2', 'start': 112, 'end': 113}
{'entity_group': 'NUM', 'score': np.float32(0.96228975), 'word': '4', 'start': 114, 'end': 115}
{'entity_group': 'NUM', 'score': np.float32(0.99615365), 'word': '230', 'start': 162, 'end': 165}


In [None]:
import re

def detect_financial_events(text):
    events = []
    if re.search(r'\bacquisition\b|\bmerger\b', text, re.IGNORECASE):
        events.append("Mergers & Acquisitions")
    if re.search(r'\bIPO\b|\bInitial Public Offering\b', text, re.IGNORECASE):
        events.append("IPO Announcement")
    if re.search(r'\bearnings call\b|\bquarterly results\b', text, re.IGNORECASE):
        events.append("Earnings Call")
    return events

events = detect_financial_events(sample_text)
print("Detected Financial Events:", events)


Detected Financial Events: ['Mergers & Acquisitions']


In [None]:
with open("RIL_report_text.txt", "r", encoding="utf-8") as f:
    report_text = f.read()

sample_section = report_text[:2000]
entities = ner_pipeline(sample_section)


FileNotFoundError: [Errno 2] No such file or directory: 'RIL_report_text.txt'

In [None]:
#--test pdf

In [None]:
!pip install PyMuPDF -q


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import fitz  # PyMuPDF

pdf_path = "RIL-Integrated-Annual-Report-2024-25.pdf"  # your uploaded PDF file
doc = fitz.open(pdf_path)

text = ""
for page in doc:
    text += page.get_text("text")

print("✅ PDF text extracted successfully! Total length:", len(text))


✅ PDF text extracted successfully! Total length: 974851


In [None]:
sample_text = text[:4000]
print(sample_text[:1000])  # see the first few lines


Integrated Annual Report
2024-25
Realising
Aspirations
Accessibility
Reliability
Variety
Mobility
Connectivity
Responsibility
Sustainability
TABLE OF CONTENTS
REPORTING SUITE 
2024-25
RIL’s Annual Reporting suite brings 
together the financial, non-financial, 
risk, and sustainability performance for 
the year.
Corporate Overview
2 
Reliance at a Glance
3 
Stakeholder Value Creation
4 
Chairman and Managing Director’s Statement
6 
10-year Financial Highlights
Management Discussion and Analysis
7 
Financial Performance and Review
	
Business Overview
9 
	 Retail
12 
 Digital Services
15 
 Media and Entertainment
19 
 Oil to Chemicals
22 
 Oil and Gas
25 
Risk and Governance
28 
Major Awards and Recognitions
Integrated Approach to Sustainable Growth
30 
Integrated Approach to ESG Governance
32 
Making Significant Strides towards a
Net Carbon Zero Future
33 
Approach to Climate-related Disclosures
34	
 N  	 Natural Capital
37	
 H  	 Human Capital
38	
 M  	 Manufactured Capital
39	
 I  	 I

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_path = "./financial_ner_model"  # your trained model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

results = ner_pipeline(sample_text)


Device set to use cpu


In [None]:
import pandas as pd

entity_df = pd.DataFrame(results)
entity_df.to_csv("RIL_extracted_entities.csv", index=False)

print("✅ Entities saved to RIL_extracted_entities.csv")
display(entity_df.head())


✅ Entities saved to RIL_extracted_entities.csv


Unnamed: 0,entity_group,score,word,start,end
0,FORM,0.997327,Report,18,24
1,NUM,0.987071,202,25,28
2,NUM,0.989337,##4,28,29
3,NUM,0.998043,25,30,32
4,NUM,0.987956,202,176,179



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [None]:
sample_text = text[:15000]  # first 15,000 characters
results = ner_pipeline(sample_text)


In [None]:
#$$$$$$$$$$$$$$$$$$$$$4$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

In [None]:
!pip install pymupdf pillow pdf2image pytesseract pypdf python-docx


Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pypdf
  Downloading pypdf-6.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pypdf-6.3.0-py3-none-any.whl (328 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [9