In [None]:
!ls -lh /content/financial_ner_model || ls -lh .
# also check safetensors size
import os
# Look for the weights in the dedicated model folder first and fall back to
# the current working directory when that file is absent.
p = "/content/financial_ner_model/model.safetensors"
p = p if os.path.exists(p) else "./model.safetensors"
print("Model file:", p)
if not os.path.exists(p):
    print("No safetensors found in expected paths.")
else:
    # Size is reported in mebibytes (bytes / 1024 / 1024).
    print("Size:", os.path.getsize(p)/1024/1024, "MB")


ls: cannot access '/content/financial_ner_model': No such file or directory
total 94M
-rw-r--r-- 1 root root  833 Nov  6 12:13 config.json
-rw-r--r-- 1 root root  93M Nov  6 12:14 model.safetensors
drwxr-xr-x 1 root root 4.0K Nov  4 14:36 sample_data
-rw-r--r-- 1 root root  125 Nov  6 12:13 special_tokens_map.json
-rw-r--r-- 1 root root 1.2K Nov  6 12:13 tokenizer_config.json
-rw-r--r-- 1 root root 654K Nov  6 12:13 tokenizer.json
-rw-r--r-- 1 root root 209K Nov  6 12:13 vocab.txt
Model file: ./model.safetensors
Size: 93.0 MB


In [None]:
!pip install -q transformers datasets tokenizers yfinance


In [None]:
!ls -lh model.safetensors
!md5sum model.safetensors


-rw-r--r-- 1 root root 331M Nov  6 12:17 model.safetensors
83a426961e4a0a824e648ace9a8269bf  model.safetensors


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import os, sys, warnings

# Directory that holds the model artifacts; "." is the current working directory.
MODEL_PATH = "."

print("Files in model folder:")
print(os.listdir(MODEL_PATH))

# Only request safetensors weights when model.safetensors exists in MODEL_PATH.
use_safetensors = os.path.exists(os.path.join(MODEL_PATH, "model.safetensors"))
print("Use safetensors?", use_safetensors)

# tokenizer, model and ner_pipeline are module-level names; ner_pipeline is
# read by extract_user_entities and by the smoke-test cell further down.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH, use_safetensors=use_safetensors)
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    print("‚úÖ Model and pipeline loaded successfully.")
except Exception as e:
    # Print a short summary, then re-raise so the original exception still stops the cell.
    print("‚ùå Model load failed:", e)
    raise


Files in model folder:
['.config', '.locks', 'models--ProsusAI--finbert', 'tokenizer_config.json', 'vocab.txt', 'special_tokens_map.json', 'config.json', 'model.safetensors', 'tokenizer.json', 'dslim_bert-base-NER', 'sample_data']
Use safetensors? True


Device set to use cpu


‚úÖ Model and pipeline loaded successfully.


In [None]:
# Smoke test: run the loaded NER pipeline on one sentence and print the raw result.
text = "Apple Inc. reported a 15% revenue increase and announced acquisition of Beats for $3 billion."
print("Input:", text)
print("NER output:")
print(ner_pipeline(text))


Input: Apple Inc. reported a 15% revenue increase and announced acquisition of Beats for $3 billion.
NER output:
[{'entity_group': 'NUM', 'score': np.float32(0.9366002), 'word': '15', 'start': 22, 'end': 24}, {'entity_group': 'NUM', 'score': np.float32(0.9855517), 'word': '3', 'start': 83, 'end': 84}]


In [None]:
def extract_user_entities(text, user_entities):
    """Return the entities in ``text`` that correspond to the user's targets.

    Two sources are combined:

    1. Entities returned by the module-level ``ner_pipeline`` whose ``word``
       contains one of the targets (case-insensitive). Each such entity is
       reported once, no matter how many targets it matches.
    2. Targets that no NER entity covered but that literally occur in
       ``text`` (case-insensitive). These are reported once each with the
       label ``"USER_DEFINED"`` and a score of ``1.0`` because they are exact
       string matches rather than model predictions.

    Args:
        text: The text to analyse.
        user_entities: Iterable of target strings. ``None``, empty strings and
            case-insensitive duplicates are ignored.

    Returns:
        A list of dicts with the keys ``"Entity"``, ``"Label"`` and
        ``"Score"``: NER matches first (pipeline order), then text-only
        matches (target order).
    """
    import re  # local import so this block does not depend on other cells' imports

    # Normalise the targets: drop blanks (an empty string is a substring of
    # everything) and case-insensitive duplicates, keeping first-seen order.
    targets = []
    seen_keys = set()
    for target in user_entities or []:
        cleaned = target.strip()
        key = cleaned.lower()
        if key and key not in seen_keys:
            seen_keys.add(key)
            targets.append(cleaned)

    extracted = []
    covered = set()  # lower-cased targets already represented by an NER entity
    for entity in ner_pipeline(text):
        word = entity["word"].lower()
        hits = [t.lower() for t in targets if t.lower() in word]
        if not hits:
            continue
        covered.update(hits)
        extracted.append({
            "Entity": entity["word"],
            "Label": entity["entity_group"],
            # Convert to a Python float *before* rounding; rounding a float32
            # first and widening afterwards yields values like 0.9366000294685364.
            "Score": round(float(entity["score"]), 4),
        })

    # Direct-substring fallback (e.g. company names the model did not tag).
    for target in targets:
        if target.lower() in covered:
            continue
        found = re.search(re.escape(target), text, flags=re.IGNORECASE)
        if found:
            extracted.append({
                "Entity": found.group(0),  # spelling as it appears in the text
                "Label": "USER_DEFINED",
                "Score": 1.0,
            })
    return extracted

def extract_financial_events(text):
    """Detect financial event keywords in ``text``.

    Keywords are matched case-insensitively as whole words (with an optional
    plural ``s``), so that e.g. ``"ipo"`` does not fire on "iPod" or
    "tripod". Multi-word keywords tolerate any run of whitespace between the
    words, which keeps them matching across line breaks.

    Args:
        text: The text to scan. ``None`` or an empty string yields no events.

    Returns:
        A list of ``{"Event": <event label>, "Keyword": <keyword>}`` dicts,
        one per keyword found, in the order the keywords are declared below.
    """
    import re  # local import so this block does not depend on other cells' imports

    keywords = {
        "merger": "Merger/Acquisition",
        "acquisition": "Merger/Acquisition",
        "ipo": "Initial Public Offering",
        "earnings call": "Earnings Call",
        "stock split": "Stock Split",
        "dividend": "Dividend Announcement"
    }
    if not text:
        return []

    events = []
    for key, value in keywords.items():
        # \b ... s?\b : whole-word match, singular or plural.
        pattern = r"\b" + r"\s+".join(re.escape(part) for part in key.split()) + r"s?\b"
        if re.search(pattern, text, flags=re.IGNORECASE):
            events.append({"Event": value, "Keyword": key})
    return events


In [None]:
import yfinance as yf

def get_financial_data(ticker):
    """Look up headline financial figures for ``ticker`` through ``yf.Ticker``.

    Returns ``None`` when ``ticker`` is falsy. Otherwise returns a dict with
    the keys "Company", "Current Price", "Market Cap", "EPS" and
    "Revenue (TTM)", read from the ticker's ``info`` mapping (missing fields
    come back as ``None``). If anything raises during the lookup, the result
    is ``{"error": <exception message>}`` instead.
    """
    if ticker:
        try:
            details = yf.Ticker(ticker).info
            # Prefer the long company name; fall back to the short one.
            company = details.get("longName")
            if not company:
                company = details.get("shortName")
            summary = {"Company": company}
            field_map = (
                ("Current Price", "currentPrice"),
                ("Market Cap", "marketCap"),
                ("EPS", "trailingEps"),
                ("Revenue (TTM)", "totalRevenue"),
            )
            for label, field in field_map:
                summary[label] = details.get(field)
            return summary
        except Exception as err:
            return {"error": str(err)}
    return None


In [None]:
def full_financial_analysis(text, user_entities, ticker=None):
    """Run entity extraction, event detection and the optional ticker lookup.

    A progress line is printed before each stage. The returned dict has the
    keys "Extracted Entities", "Detected Events" and "Company Financials";
    the last one is ``None`` when ``ticker`` is falsy.
    """
    report = {}

    print("üîπ Extracting user-defined entities...")
    report["Extracted Entities"] = extract_user_entities(text, user_entities)

    print("üîπ Extracting financial events...")
    report["Detected Events"] = extract_financial_events(text)

    print("üîπ Fetching company financials...")
    if ticker:
        report["Company Financials"] = get_financial_data(ticker)
    else:
        report["Company Financials"] = None

    return report


In [None]:
import pandas as pd

# Demo inputs for the end-to-end analysis.
sample_text = "Apple Inc. reported a 15% revenue increase and announced acquisition of Beats for $3 billion."
user_entities = ["Apple Inc.", "revenue", "EPS", "market cap"]
ticker = "AAPL"

results = full_financial_analysis(sample_text, user_entities, ticker)

print("\n=== Extracted Entities ===")
print(results["Extracted Entities"])
print("\n=== Detected Events ===")
print(results["Detected Events"])
print("\n=== Company Financials ===")
print(results["Company Financials"])

# Pin the column layout so that an empty result list still produces a CSV with
# a header row; a DataFrame built from [] has no columns, and the resulting
# header-less file cannot be parsed back by pd.read_csv.
entities_frame = pd.DataFrame(results["Extracted Entities"], columns=["Entity", "Label", "Score"])
events_frame = pd.DataFrame(results["Detected Events"], columns=["Event", "Keyword"])
entities_frame.to_csv("user_entities.csv", index=False)
events_frame.to_csv("events.csv", index=False)
print("\nSaved user_entities.csv and events.csv to workspace. Use file panel to download.")


üîπ Extracting user-defined entities...
üîπ Extracting financial events...
üîπ Fetching company financials...

=== Extracted Entities ===
[{'Entity': '15', 'Label': 'NUM', 'Score': 0.9366000294685364}, {'Entity': '15', 'Label': 'NUM', 'Score': 0.9366000294685364}, {'Entity': '3', 'Label': 'NUM', 'Score': 0.9855999946594238}, {'Entity': '3', 'Label': 'NUM', 'Score': 0.9855999946594238}]

=== Detected Events ===
[{'Event': 'Merger/Acquisition', 'Keyword': 'acquisition'}]

=== Company Financials ===
{'Company': 'Apple Inc.', 'Current Price': 270.14, 'Market Cap': 3991684251648, 'EPS': 7.46, 'Revenue (TTM)': 416161005568}

Saved user_entities.csv and events.csv to workspace. Use file panel to download.


In [None]:
from IPython.display import display
# Read the two exported CSV files back from disk and render them as tables.
display(pd.read_csv("user_entities.csv"))
display(pd.read_csv("events.csv"))


Unnamed: 0,Entity,Label,Score
0,15,NUM,0.9366
1,15,NUM,0.9366
2,3,NUM,0.9856
3,3,NUM,0.9856


Unnamed: 0,Event,Keyword
0,Merger/Acquisition,acquisition


In [None]:
#slm

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load the ProsusAI/finbert sequence-classification model and its tokenizer
# by hub id.
sent_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
sent_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

# Wrap model + tokenizer in a sentiment-analysis pipeline.
sentiment_pipeline = pipeline("sentiment-analysis", model=sent_model, tokenizer=sent_tokenizer)

# Analyze sentiment of your extracted text
# NOTE: this rebinds the module-level name sample_text used by the earlier analysis cell.
sample_text = "Apple Inc. reported a 15% revenue increase and announced acquisition of Beats for $3 billion."
sentiment = sentiment_pipeline(sample_text)

print("Financial Sentiment:", sentiment)




config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


Financial Sentiment: [{'label': 'positive', 'score': 0.9544737339019775}]


In [None]:
# NOTE: rebinds the module-level names text and user_entities used by earlier cells.
text = "Apple Inc. reported a 15% revenue increase and announced the acquisition of Beats for $3 billion."
user_entities = ["Apple Inc.", "Beats", "revenue", "acquisition"]
analysis_results = full_financial_analysis(text, user_entities, ticker="AAPL")
# manual test: pretty-print the combined result dict for visual inspection
from pprint import pprint
pprint(analysis_results)


üîπ Extracting user-defined entities...
üîπ Extracting financial events...
üîπ Fetching company financials...
{'Company Financials': {'Company': 'Apple Inc.',
                        'Current Price': 270.14,
                        'EPS': 7.46,
                        'Market Cap': 3991684251648,
                        'Revenue (TTM)': 416161005568},
 'Detected Events': [{'Event': 'Merger/Acquisition', 'Keyword': 'acquisition'}],
 'Extracted Entities': [{'Entity': '15',
                         'Label': 'NUM',
                         'Score': 0.9277999997138977},
                        {'Entity': '15',
                         'Label': 'NUM',
                         'Score': 0.9277999997138977},
                        {'Entity': '15',
                         'Label': 'NUM',
                         'Score': 0.9277999997138977},
                        {'Entity': '15',
                         'Label': 'NUM',
                         'Score': 0.9277999997138977},
              

In [None]:
!pip install pdfplumber


Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m42.8/42.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.0.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m67.9/67.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-

In [None]:
#4#

In [None]:
!pip install pymupdf pytesseract pdf2image opencv-python pillow transformers yfinance
!sudo apt-get install tesseract-ocr

Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m24.1/24.1 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pytesseract, pymupdf, pdf2image
Successfully installed pdf2image-1.17.0 pymupdf-1.26.6 pytesseract-0.3.13


In [None]:
!pip install google-generativeai




In [None]:
!pip install pymupdf


Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m24.1/24.1 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.6


In [None]:
import fitz  # PyMuPDF

# Absolute path to the PDF to analyse.
pdf_path = "/content/00001140361-09-008205.pdf"
# doc stays open as a module-level name; the extraction cells below index into it (doc[i]).
doc = fitz.open(pdf_path)

print("Total pages:", len(doc))


Total pages: 229


In [None]:
def extract_text(page):
    """Return the result of ``page.get_text("text")`` for one page."""
    plain_text = page.get_text("text")
    return plain_text

# One entry per page of the module-level doc, in page order.
all_pages_text = [extract_text(doc[i]) for i in range(len(doc))]


In [None]:
def extract_tables(page):
    """Return the extracted data of every table found on ``page``.

    Tables are located with ``page.find_tables()``; each one contributes the
    value of its ``extract()`` call, in the order the tables are yielded.
    """
    return [found.extract() for found in page.find_tables()]

# all_tables[i] is the list of tables extracted from page i of doc.
all_tables = [extract_tables(doc[i]) for i in range(len(doc))]

print("Successfully extracted tables from all pages.")

# Summarise over the whole document rather than only the first page, so the
# message distinguishes "page 1 has no table" from "the document has no table".
total_tables = sum(len(page_tables) for page_tables in all_tables)
print(f"Found {total_tables} table(s) across {len(all_tables)} page(s).")

# Example: print the first table found anywhere in the document, if any.
first_page_with_tables = next(
    (index for index, page_tables in enumerate(all_tables) if page_tables),
    None,
)
if first_page_with_tables is not None:
    print(f"First table found (page {first_page_with_tables + 1}):")
    print(all_tables[first_page_with_tables][0])
else:
    print("No tables found in the document.")

Consider using the pymupdf_layout package for a greatly improved page layout analysis.
Successfully extracted tables from all pages.
No tables found on the first page or in the document.


In [None]:
import os
from PIL import Image
import io

# Output folder for extracted images; extract_images writes its PNG files here.
os.makedirs("pdf_images", exist_ok=True)

def extract_images(page_number, page):
    """Save every image referenced by ``page`` as a PNG file.

    Files are written to ``pdf_images/page{page_number}_img{index}.png``; the
    ``pdf_images`` directory must already exist.

    Args:
        page_number: Number used in the output file names.
        page: The page whose images are extracted. Its owning document is
            taken from ``page.parent`` instead of a module-level variable, so
            the function works for pages of any open document.

    Returns:
        The list of written file paths, in the order reported by
        ``page.get_images(full=True)``.
    """
    document = page.parent
    images = []
    for img_index, img in enumerate(page.get_images(full=True)):
        xref = img[0]  # first field of each image entry is its xref
        pix = fitz.Pixmap(document, xref)
        # PNG can only store gray or RGB data (optionally with alpha). Count
        # the colour components without the alpha channel: 4 or more means
        # CMYK, which must be converted to RGB before encoding. Testing
        # ``pix.n < 5`` alone would let CMYK-without-alpha (n == 4) through.
        if pix.n - pix.alpha >= 4:
            pix = fitz.Pixmap(fitz.csRGB, pix)
        img_bytes = pix.tobytes("png")

        image_path = f"pdf_images/page{page_number}_img{img_index}.png"
        with open(image_path, "wb") as f:
            f.write(img_bytes)
        images.append(image_path)
    return images

# all_images[i] is the list of PNG paths written for page i of doc.
all_images = [extract_images(i, doc[i]) for i in range(len(doc))]
