# LOADING THE PDF DATA USING PYPDF2 OR PDFPLUMBER

In [1]:
import PyPDF2

def extractPDF(pdf_file):
    """Read the PDF at ``pdf_file`` with PyPDF2 and return all of its text.

    The text of every page is concatenated in page order, with no separator
    inserted between pages.
    """
    with open(pdf_file, 'rb') as stream:
        document = PyPDF2.PdfReader(stream)
        # Extraction happens while the file is still open.
        return "".join(page.extract_text() for page in document.pages)


# Run the PyPDF2-based extractor on the sample document and show the raw text.
pdf_file = 'Sample.pdf'
# pdf_text is reused further down when the single-document prompt is built.
pdf_text = extractPDF(pdf_file)
print(pdf_text)

In recent developments, the Urban Park Renovation project has been finalized with LMN Builders 
and the City Council. The contract will kick off on September 1, 2024, and will run through March 
1, 2025. This contract, valued at $1,800,000, includes a detailed plan for overhauling the existing 
park facilities.
The scope includes the demolition of outdated structures, excavation work, and the construction of 
new recreational facilities. The project aims to rejuvenate the park and make it more accessible and 
enjoyable for the community. The agreement outlines specific responsibilities for both parties and 
includes timelines and quality benchmarks to ensure successful delivery.


# USING AN OLLAMA LLM TO EXTRACT THE REQUIRED DATA FROM THE UNSTRUCTURED PDF TEXT

In [9]:
!pip install -qU langchain-ollama

You should consider upgrading via the 'C:\Users\user\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [2]:
from langchain_ollama import ChatOllama

# Chat model client used by the extraction code below via llm.invoke(...).
llm = ChatOllama(
        model="llama3",
        # NOTE(review): temperature=0 presumably asks for the least random
        # output so repeated runs extract the same fields — confirm against
        # the ChatOllama documentation.
        temperature=0,
        # other params...
)

# SAMPLE FOR 1 INVOICE

In [3]:
from langchain_core.messages import AIMessage




# I HAVE DEFINED THE PROMPT TO OUTPUT THE NEEDED INFORMATION



# Build the extraction prompt around the text of the single sample PDF.
prompt = f"""
Extract the following information from the given text:
1. Project Name
2. Start Date
3. End Date
4. Parties Involved
5. Contract Value

Text: 
{pdf_text}

Output the information as follows:
- Project Name:
- Start Date:
- End Date:
- Parties Involved:
- Contract Value:
"""


messages = [
    ("system", "You are an expert in extracting structured information from unstructured text."),
    ("human", prompt),
]

ai_msg = llm.invoke(messages)

# The model may prefix its answer with a fixed preamble. Keep only the text
# after the first occurrence of it, but fall back to the whole reply when the
# preamble is absent: indexing the result of split(...)[1] would raise an
# IndexError in that case.
ai_msg_content = ai_msg.content
_preamble, _marker, _answer = ai_msg_content.partition('Here is the extracted information:')
extracted_info = (_answer if _marker else ai_msg_content).strip()

print(extracted_info)



- Project Name: Urban Park Renovation
- Start Date: September 1, 2024
- End Date: March 1, 2025
- Parties Involved: LMN Builders and the City Council
- Contract Value: $1,800,000


# FOR ALL THE PDF INVOICES

In [20]:
import os
import csv
import pdfplumber
from langchain_core.messages import AIMessage
 
# Directory that is listed for ".pdf" files to process.
folder_path = 'Sample Contracts'

# One dict of extracted fields is appended here per processed PDF.
all_extracted_data = []

def extractTextPdf(pdf_path) -> str:
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    A page for which ``extract_text()`` yields ``None`` (or an empty string)
    contributes nothing, instead of raising ``TypeError`` during string
    concatenation.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        The page texts joined in page order with no separator added.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Join while the document is still open; `or ""` absorbs text-less pages.
        return "".join(page.extract_text() or "" for page in pdf.pages)


def inferenceLlm(text):
    """Ask the LLM for the contract fields found in ``text`` and parse its reply.

    The raw reply is printed. Every reply line containing a colon is then
    split at its first colon into a key/value pair. Keys are kept as the model
    wrote them apart from surrounding whitespace, so a leading list marker
    such as ``- `` stays part of the key.

    Args:
        text: Unstructured document text to embed in the prompt.

    Returns:
        A dict mapping each parsed key to its value, both stripped of
        surrounding whitespace. Empty when no reply line contains a colon.
    """
    prompt = f"""
    Extract the following information from the given text:
    1. Project Name
    2. Start Date
    3. End Date
    4. Parties Involved
    5. Contract Value

    Text: 
    {text}

    Output the information as follows:
    - Project Name:
    - Start Date:
    - End Date:
    - Parties Involved:
    - Contract Value:
    """

    messages = [
        ("system", "You are an expert in extracting structured information from unstructured text."),
        ("human", prompt),
    ]

    ai_msg_content = llm.invoke(messages).content
    print(ai_msg_content)

    # Drop the model's fixed preamble when it is present. When it is missing,
    # parse the whole reply rather than failing with an IndexError, which is
    # what indexing split(marker)[1] would do.
    marker = 'Here is the extracted information:'
    _, found, answer = ai_msg_content.partition(marker)
    body = (answer if found else ai_msg_content).strip()

    extracted_info = {}
    for line in body.splitlines():
        # Split on the first colon only so values such as times keep theirs.
        key, colon, value = line.partition(':')
        if colon:
            extracted_info[key.strip()] = value.strip()
    return extracted_info



# Extract the fields from every ".pdf" file in the folder, one dict per file.
for filename in os.listdir(folder_path):
    if not filename.endswith(".pdf"):
        continue
    pdf_path = os.path.join(folder_path, filename)
    pdf_text = extractTextPdf(pdf_path)
    extracted_info = inferenceLlm(pdf_text)
    all_extracted_data.append(extracted_info)

csv_file = "extracted_invoices_info.csv"
if all_extracted_data:
    # Use the ordered union of all keys as the header. Taking only the first
    # row's keys makes DictWriter raise ValueError as soon as a later row
    # carries a key the first one lacks; rows missing a key get an empty cell.
    fieldnames = list(
        dict.fromkeys(key for row in all_extracted_data for key in row)
    )
    with open(csv_file, mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_extracted_data)
    print(f"Structured information has been saved to {csv_file}.")
else:
    # Nothing was processed: indexing all_extracted_data[0] would raise an
    # IndexError after the output file had already been truncated.
    print(f"No extracted data to save; {csv_file} was not written.")


Here is the extracted information:

- Project Name: Riverside Residential Project
- Start Date: September 10, 2024
- End Date: June 30, 2025
- Parties Involved: Riverside Constructions Ltd. and DEF Housing Corp.
- Contract Value: $2,200,000
Here is the extracted information:

- Project Name: Greenfield Development
- Start Date: August 15, 2024
- End Date: December 15, 2024
- Parties Involved: ABC Construction Company and XYZ Developers
- Contract Value: $2,500,000
Here is the extracted information:

- Project Name: Urban Park Renovation project
- Start Date: September 1, 2024
- End Date: March 1, 2025
- Parties Involved: LMN Builders and the City Council
- Contract Value: $1,800,000
Here is the extracted information:

- Project Name: Downtown Office Tower
- Start Date: November 1, 2024
- End Date: October 1, 2025
- Parties Involved: Downtown Construction Inc. and ABC Realty
- Contract Value: $4,200,000
Here is the extracted information:

- Project Name: Lakeview Community Center
- Star

In [None]:
# Inspect the container type of the collected results.
type(all_extracted_data)

In [None]:
!pip install pdfplumber