# DATA LOADING 

In [1]:
import PyPDF2

def extractPDF(pdf_file):
    """Return the text of every page of a PDF, extracted with PyPDF2.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        One string holding the text of all pages in page order. Pages are
        separated by a newline so the last line of a page is never fused
        with the first line of the following page. Pages that yield no
        text are skipped, which means a single-page document is returned
        exactly as the extractor produced it.
    """
    with open(pdf_file, 'rb') as pdf_stream:
        reader = PyPDF2.PdfReader(pdf_stream)
        # Extract while the file is still open; `or ""` guards against a
        # page that yields no text at all.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(text for text in page_texts if text)


# Smoke run: extract one sample invoice with PyPDF2 and show the raw text.
# NOTE: `pdf_text` holds a single string here; the folder-loading cell further
# down rebinds the same name to a list with one string per invoice.
pdf_file = 'Sample Invoices/Invoice_Sample03.pdf'
pdf_text = extractPDF(pdf_file)
print(pdf_text)

Invoice No: 56789
---------------------
Item 1: Insulation Material
- Quantity: 100 rolls
- Unit Price: $60.00
- Total Amount: $6,000.00
Item 2: Drywall Sheets
- Quantity: 150 sheets
- Unit Price: $25.00
- Total Amount: $3,750.00
Item 3: Paint (5-gallon cans)
- Quantity: 20 cans
- Unit Price: $45.00
- Total Amount: $900.00
Subtotal: $10,650.00
Tax (8%): $852.00
Total Invoice Amount: $11,502.00
Invoice Date: July 25, 2024
Vendor: GHI Building Supplies
Address: 9101 Trade St, Buildville, ST 34567
Customer: JKL Construction Co.
Project: Warehouse Expansion
Terms: Payment due by August 25, 2024.


# INITIALIZING LLAMA TO EXTRACT FEATURES FROM THE INVOICE

In [9]:
!pip install -qU langchain-ollama

You should consider upgrading via the 'C:\Users\user\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [2]:
from langchain_ollama import ChatOllama

# Chat client for the Ollama-served "llama3" model; temperature 0 keeps the
# extraction output as repeatable as the model allows.
llm = ChatOllama(model="llama3", temperature=0)

There are a few ways to access Llama 3 (the `llama3` model configured above).

To run locally, use Ollama.ai. See [here](https://python.langchain.com/docs/integrations/chat/ollama) for details on installation and setup.

To use an external API, which is not private, you can use Replicate. You can register and get your REPLICATE_API_TOKEN [here](https://replicate.com/).

from langchain.llms import Replicate
from langchain.chains import LLMChain
from langchain_core.prompts import BasePromptTemplate

# EXTRACTING FEATURES FROM THE EXTRACTED DATA FROM THE PDF


In [3]:
def extractTextPdf(pdf_path):
    """Return the text of every page of a PDF, extracted with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string holding the text of all pages in page order, separated
        by newlines. Pages that yield no text are skipped, so a single-page
        document is returned exactly as pdfplumber produced it.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `extract_text()` may give None (some pdfplumber versions) or "" for a
        # page without a text layer; `or ""` keeps that from raising TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # A newline between pages stops the last line of one page from being glued
    # onto the first line of the next.
    return "\n".join(text for text in page_texts if text)

In [4]:
import os
import pdfplumber
# Extract the text of every PDF in the sample folder; `pdf_text` ends up holding
# one string per invoice, in file-name order.
folder_path = 'Sample Invoices'
pdf_text = []
# os.listdir() returns names in arbitrary order; sorting makes the invoices reach
# the prompt in the same order on every run and on every platform.
for filename in sorted(os.listdir(folder_path)):
    # Compare the extension case-insensitively so files such as "SCAN.PDF" are kept.
    if not filename.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(folder_path, filename)
    all_extracted_data = extractTextPdf(pdf_path)
    print(all_extracted_data)
    pdf_text.append(all_extracted_data)

------------------------------------------------
| Item Description | Quantity | Unit Price | Total Amount |
------------------------------------------------
| Concrete Mix | 15 | $120.00 | $1,800.00 |
| Brick Masonry | 200 | $0.50 | $100.00 |
| Scaffolding | 10 | $50.00 | $500.00 |
------------------------------------------------
Invoice Date: July 15, 2024
Invoice Number: INV-2024-001
Vendor: ABC Construction Supplies
Address: 1234 Industrial Ave, Suite 100, Cityville, ST 56789
Customer: XYZ Builders Inc.
Project: New Office Building
Terms: Payment due within 30 days from the invoice date.
Project Invoice
----------------------
Description: Hydraulic Cement
Quantity: 25
Unit Price: $90.00
Total Amount: $2,250.00
Description: Electrical Wiring
Quantity: 300 meters
Unit Price: $0.75 per meter
Total Amount: $225.00
Description: Safety Helmets
Quantity: 50
Unit Price: $20.00
Total Amount: $1,000.00
Invoice Date: July 20, 2024
Invoice Number: INV-2024-002
Vendor: DEF Construction Material

# PROMPT ENGINEERING

In [5]:
from langchain_core.messages import AIMessage

# `pdf_text` is a list with one string per invoice. Interpolating the list itself
# would paste its Python repr (brackets, quotes, escaped "\n") into the prompt, so
# the documents are joined into plain text first. A bare string is used as is.
invoice_text = pdf_text if isinstance(pdf_text, str) else "\n\n".join(pdf_text)

prompt =f"""Extract the following information from the given text:
1.Description, 
2.Quantity, 
3.Unit price, 
4.Total amount

Text: 
{invoice_text}

Output the information in the same format without any invoice information like **Invoice 1** **Invoice 1(continued)**:

1.Description: concrete mix
2.Quantity: 15
3.Unit price: $ 120
4.Total amount:$ 1800
"""

messages = [
    ("system", "You are an expert in extracting structured information from unstructured text."),
    ("human", prompt),
]

ai_msg = llm.invoke(messages)


ai_msg_content = ai_msg.content

# The reply is expected to open with this sentence, but nothing guarantees it.
# Indexing the result of split() raises IndexError when the sentence is absent,
# so partition() is used and the whole reply is kept as the fallback.
preamble = 'Here is the extracted information in the desired format:'
_, found_preamble, after_preamble = ai_msg_content.partition(preamble)
extracted_info = (after_preamble if found_preamble else ai_msg_content).strip()
print(extracted_info)




1. Description: Concrete Mix
2. Quantity: 15
3. Unit Price: $120.00
4. Total Amount: $1,800.00

1. Description: Brick Masonry
2. Quantity: 200
3. Unit Price: $0.50
4. Total Amount: $100.00

1. Description: Scaffolding
2. Quantity: 10
3. Unit Price: $50.00
4. Total Amount: $500.00

1. Description: Hydraulic Cement
2. Quantity: 25
3. Unit Price: $90.00
4. Total Amount: $2,250.00

1. Description: Electrical Wiring
2. Quantity: 300 meters
3. Unit Price: $0.75 per meter
4. Total Amount: $225.00

1. Description: Safety Helmets
2. Quantity: 50
3. Unit Price: $20.00
4. Total Amount: $1,000.00

1. Description: Insulation Material
2. Quantity: 100 rolls
3. Unit Price: $60.00
4. Total Amount: $6,000.00

1. Description: Drywall Sheets
2. Quantity: 150 sheets
3. Unit Price: $25.00
4. Total Amount: $3,750.00

1. Description: Paint (5-gallon cans)
2. Quantity: 20 cans
3. Unit Price: $45.00
4. Total Amount: $900.00

1. Description: Plumbing Pipes
2. Quantity: 50
3. Unit Price: $40 each
4. Total Amount

# CLEANING THE DATA AND STRUCTURING

In [11]:
import csv
import re

def parse_item(item_string):
    """Pull the four line-item fields out of one block of extracted text.

    Each field is located by its label rather than by line position, and the
    label match ignores case, so both "3. Unit Price: $120.00" and
    "3.Unit price: $ 120" are understood. An optional leading number such as
    "1." or "1)" in front of the label is accepted.

    Args:
        item_string: Text of a single item, one "Label: value" pair per line.

    Returns:
        A list ``[description, quantity, unit_price, total_amount]`` of
        stripped strings. A field whose label or value is missing is returned
        as an empty string instead of raising.
    """
    values = []
    for label in ('Description', 'Quantity', 'Unit Price', 'Total Amount'):
        # [ \t]* (not \s*) keeps the match on one line, so an empty value can
        # never swallow the following line.
        pattern = rf'^[ \t]*(?:\d+[.)][ \t]*)?{re.escape(label)}[ \t]*:[ \t]*(.*)$'
        match = re.search(pattern, item_string, flags=re.IGNORECASE | re.MULTILINE)
        values.append(match.group(1).strip() if match else '')
    return values

# Every item begins with a line numbered "1."; split on the blank line(s) in
# front of it. `\s*` tolerates more than one blank line or blank lines carrying
# stray spaces, and empty chunks are dropped so an empty reply yields no rows
# instead of one unparsable item.
items = [
    chunk
    for chunk in re.split(r'\n\s*\n(?=[ \t]*1\.)', extracted_info.strip())
    if chunk.strip()
]

parsed_data = [parse_item(item) for item in items]

# Explicit UTF-8 so the file content does not depend on the platform's default
# text encoding; newline='' is what the csv module requires for correct row ends.
with open('construction_materials.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Description', 'Quantity', 'Unit Price', 'Total Amount'])
    writer.writerows(parsed_data)

print("Data has been written to construction_materials.csv")

Data has been written to construction_materials.csv


In [27]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.2-py3-none-any.whl.metadata (40 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20231228->pdfplumber)
  Downloading cryptography-43.0.0-cp39-abi3-win_amd64.whl.metadata (5.4 kB)
Downloading pdfplumber-0.11.2-py3-none-any.whl (58 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ---------------------------------------- 5.6/5.6 MB 56.9 MB/s eta 0:00:00
Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl (2.9 MB)
   ---------------------------------------- 0.0/2.9 MB ? eta -:--:--
   ---------------------------------------- 2.9/2.9 MB 33.7 MB/s eta 0:00:00
Downloading cryptography-43.0.0-cp39-abi3-win_amd64.w

