<a href="https://colab.research.google.com/github/sris321/PDF-Extraction/blob/main/Financial_data_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
pip install pytesseract



In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
pip install pdf2image



In [13]:
!apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 29 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Fetched 186 kB in 1s (216 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 124947 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.6_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.6) ...
Setting up poppler-utils (22.02.0-2ubuntu0.6) ...
Processing triggers for man-db (2.10.2-1) ...


In [14]:
!apt-get install tesseract-ocr
!apt-get install libtesseract-dev


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 29 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (5,845 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 124977 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

In [15]:
!pip install python-dotenv



In [25]:
import pytesseract
from PIL import Image
import os
import json
import logging
from pdf2image import convert_from_path
from google import genai

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Mount Google Drive (if using Google Colab)
from google.colab import drive
from dotenv import load_dotenv

drive.mount('/content/drive')  # Connecting with Google account
load_dotenv('/content/drive/My Drive/.env')
api_key = os.getenv("GEMINI_API_KEY")

def pdf_to_images(pdf_path):
    """Converts a PDF file into images in memory."""
    images = convert_from_path(pdf_path)
    logging.info(f"Converted PDF to {len(images)} images in memory.")
    return images

def generate_text(prompt):
    """Generates text from the Google Gemini API."""
    client = genai.Client(api_key=api_key)

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt,
    )
    if response:
        logging.info("Successfully generated response from Gemini API.")
        return response.text
    else:
        logging.warning("Empty response from Gemini API.")
    return "{}"

def extract_financial_data(pdf_file_path):
    logging.info(f"Starting extraction process for {pdf_file_path}")
    pdf_name = os.path.basename(pdf_file_path).replace('.pdf', '')
    images = pdf_to_images(pdf_file_path)
    extracted_json = {}

    for i, image in enumerate(images):
        logging.info(f"Processing page {i+1}")
        all_text = pytesseract.image_to_string(image)

        filtered_text = ""
        if all_text and ("STATEMENT OF STANDALONE" in all_text.upper() or "STATEMENT OF CONSOLIDATED" in all_text.upper() or "STATEMENT OF UNAUDITED FINANCIAL" in all_text.upper()):
            filtered_text = all_text  # Directly using extracted text
            logging.info(f"Relevant text found in page {i+1}")

            prompt = f"""
            Extract only standalone and consolidated financial statements from the following text.
            Return a **valid JSON** with all the mentioned parameters for all the mentioned dates.
            DO NOT return extra text—only return JSON format.

            ### EXTRACTED TEXT FROM RELEVANT PAGES:
            {filtered_text}
            """
            json_data = generate_text(prompt)
            json_data = json_data.replace('```json', '').replace('```', '')

            try:
                extracted_json[f"page_{i+1}"] = json.loads(json_data)
                logging.info(f"Successfully parsed JSON data for page {i+1}")
            except json.JSONDecodeError:
                logging.error(f"Error decoding JSON for page {i+1}")

        else:
            logging.info(f"No relevant financial data found in page {i+1}")

    # Ensure the output directory exists
    output_directory = "/content/output"
    os.makedirs(output_directory, exist_ok=True)

    output_json_path = os.path.join(output_directory, f"{pdf_name}_extracted_financials.json")
    with open(output_json_path, "w") as f:
        json.dump(extracted_json, f, indent=4)

    logging.info(f"Extracted financial data saved to {output_json_path}")
    return output_json_path

def inference(json_path):
    """Loads and prints financial data from the extracted JSON."""
    if not os.path.exists(json_path):
        logging.error("Error: JSON file not found.")
        return

    with open(json_path, "r") as f:
        data = json.load(f)

    logging.info("Extracted Financial Data:")
    for key, value in data.items():
        logging.info(f"\nPage: {key}\n{json.dumps(value, indent=4)}")

# Run extraction and inference
pdf_path = "/content/data30.pdf"
json_output = extract_financial_data(pdf_path)
inference(json_output)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
