<a href="https://colab.research.google.com/github/kaustubhgbu/Evaluating-Nutritional-Density-of-Foods-to-Enhance-Dietary-Health-and-Well-Being/blob/main/Invoice_Data_Extraction_Tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Step 1: Install the required packages
!pip install pymupdf
!pip install -q -U google-generativeai
!pip install pytesseract
!apt-get install tesseract-ocr

# Step 2: Importing the libraries we need
import pathlib
import textwrap
import google.generativeai as genai
from IPython.display import display, Markdown
from google.colab import files
import fitz  # PyMuPDF for working with PDFs
import re
from PIL import Image
import pytesseract  # OCR to read text from images

# Step 3: A function to turn text into Markdown format
def to_markdown(text):
    """Wrap ``text`` in a Markdown block quote for notebook display.

    Bullet characters ('•') are rewritten as indented '*' list markers, and
    every line -- blank lines included -- is prefixed with '> '.

    Returns an IPython ``Markdown`` object built from the quoted text.
    """
    bulleted = text.replace('•', '  *')
    # The always-true predicate makes textwrap.indent prefix blank lines too,
    # which it would otherwise skip.
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)

# Step 4: Getting the API key securely and setting up the Gemini API
# The key is read from Colab's user secrets (stored under the name 'GOOGLE_API_KEY')
# so it never has to be hard-coded in the notebook.
from google.colab import userdata
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

# Setting up the Gemini API with our API key
# (must run before any other genai call in this notebook)
genai.configure(api_key=GOOGLE_API_KEY)

# Step 5: Asking the user to upload an invoice file (PDF or image)
print("Please upload your invoice PDF or image:")
uploaded = files.upload()

# The upload result maps each saved file name to its contents; it is empty
# when the dialog is cancelled, so fail with a clear message instead of an
# opaque IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded. Please re-run this cell and choose an invoice PDF or image.")

# Getting the name of the uploaded file
# (only the first file is processed if several were selected)
file_path = next(iter(uploaded))

# Step 6: A function to pull text out of the uploaded PDF or image
def extract_text(file_path):
    """Extract the text content of an invoice file.

    The file type is decided purely by the (case-insensitive) extension:

    * ``.pdf`` -- the text of every page is read with PyMuPDF and concatenated
      in page order.
    * ``.jpg`` / ``.jpeg`` / ``.png`` -- the image is run through Tesseract OCR.

    Args:
        file_path: Path (str or path-like) of the file to read.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    # Checking the file type by its extension
    file_ext = pathlib.Path(file_path).suffix.lower()

    if file_ext == '.pdf':
        # Join once instead of growing a string page by page
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)

    if file_ext in ('.jpg', '.jpeg', '.png'):
        # Open the image in a context manager so the underlying file handle
        # is released as soon as OCR has finished
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

    # If the file type isn't supported, show an error
    raise ValueError("Unsupported file type. Please upload a PDF or an image file.")

# Extracting the text from the uploaded file
# (raises ValueError if the upload is not a .pdf, .jpg, .jpeg or .png file)
extracted_text = extract_text(file_path)

# Step 7: A function to use the Gemini API to find details in the invoice text
def extract_invoice_details_with_gemini(text):
    """Ask the generative model to pull key fields out of raw invoice text.

    The invoice text is embedded in a fixed prompt requesting the customer
    details, the product names only, and the total amount. The prompt is sent
    with ``genai.generate_text`` and the output is capped at 500 tokens.

    Args:
        text: Raw text extracted from the invoice.

    Returns:
        The ``result`` attribute of the API response.
        NOTE(review): presumably this can be None when the service returns no
        completion -- verify and handle if needed.

    NOTE(review): ``generate_text`` looks like the SDK's older text-completion
    entry point rather than a Gemini model call -- confirm it is still
    supported by the installed google-generativeai version.
    """
    request_prompt = f"""
    The following text is from an invoice. Please identify and extract the relevant information,
    including:
    - Customer Details (such as name, address, and contact information)
    - A list of product names only (ignore quantities, weights, HSN codes, prices, etc.)
    - The total amount

    Invoice Text:
    {text}

    Extracted Information:
    - Customer Details:
    - Products (Names Only):
    - Total Amount:
    """

    # Single round trip to the API; only the generated text is handed back
    completion = genai.generate_text(prompt=request_prompt, max_output_tokens=500)
    return completion.result

# Extracting the invoice details using the Gemini API
# (sends the extracted text to the remote API, so the API key must already be configured)
invoice_details = extract_invoice_details_with_gemini(extracted_text)

# Step 8: Showing the extracted details in Markdown format
# to_markdown prefixes every line with '> ' so the result renders as a block quote
display(to_markdown(f"**Extracted Details:**\n{invoice_details}"))


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Please upload your invoice PDF or image:


Saving Sample Invoice image.png to Sample Invoice image.png


> **Extracted Details:**
> Customer Details:
>     Test
>     Hyderabad, TELANGANA, 500089
>     Ph: 9108239284
>     test@gmail.com
> 
>     - Products (Names Only):
>     WASTE AND SCRAP OF STAINLESS STEEL
> 
>     - Total Amount:
>     7,68,771.00