In [1]:
!pip install -q  PyPDF2 pandas dask[complete] groq

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.6/109.6 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.3/43.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:

from PyPDF2 import PdfReader
import pandas as pd
import dask.delayed
import dask.distributed
from typing import List, Dict, Optional, Tuple
import groq
import os
import json
import logging
from dataclasses import dataclass
from pathlib import Path
import time
from concurrent.futures import ThreadPoolExecutor
import re
from google.colab import userdata


In [15]:


@dataclass
class BOQItem:
    product_name: str
    specification: str
    quantity: str
    unit: str
    page_number: int = 0

    def validate(self) -> Tuple[bool, List[str]]:
        """Validate BOQ item fields."""
        errors = []
        if not self.product_name.strip():
            errors.append("Empty product name")
        if not self.specification.strip():
            errors.append("Empty specification")
        if not self.quantity.strip():
            errors.append("Empty quantity")
        if not self.unit.strip():
            errors.append("Empty unit")

        try:
            float(self.quantity.replace(',', ''))
        except ValueError:
            errors.append(f"Invalid quantity format: {self.quantity}")

        return len(errors) == 0, errors

    def clean(self) -> None:
        """Clean and normalize fields."""
        self.product_name = self.product_name.strip()
        self.specification = self.specification.strip()
        self.quantity = re.sub(r'[^\d.,]', '', self.quantity)
        self.unit = self.unit.strip().lower()

class BOQExtractor:
    def __init__(self):
        """Initialize with Groq client."""
        # Get first available API key from environment
        api_key = userdata.get('GROQ_API_KEY')
        if not api_key:
            raise ValueError("GROQ_API_KEY not found in environment")

        self.client = groq.Groq(api_key=api_key)

        # Configure logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def extract_text(self, pdf_path: str) -> Dict[int, str]:
        """Extract text from PDF and return dict with page numbers and text."""
        page_texts = {}

        try:
            with open(pdf_path, 'rb') as file:
                pdf = PdfReader(file)
                for i, page in enumerate(pdf.pages, start=1):
                    text = page.extract_text()
                    if text.strip():
                        cleaned_text = " ".join(line.strip() for line in text.splitlines())
                        page_texts[i] = cleaned_text
        except Exception as e:
            self.logger.error(f"Error processing PDF: {e}")
            raise

        return page_texts

    def extract_fields(self, chunk: str) -> List[BOQItem]:
        """Extract BOQ fields with validation and cleaning."""
        prompt = """You are a precise extractor that only outputs valid JSON arrays. Extract these fields from the text:
        1. Product Name
        2. Product Specification with Description
        3. Quantity
        4. Unit of Measurement

        Only output a valid JSON array like this, with no additional text:
        [
            {
                "product_name": "example name",
                "specification": "detailed spec",
                "quantity": "numeric value",
                "unit": "unit of measurement"
            }
        ]

        If no items are found, output an empty array: []"""

        try:
            response = self.client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": chunk}
                ],
                temperature=0.2,  # Lower temperature for more consistent output
                max_tokens=1000
            )

            content = response.choices[0].message.content

            # Debug logging
            self.logger.debug(f"Raw LLM response: {content}")

            # Clean the content to ensure it's valid JSON
            content = content.strip()
            # Remove any markdown code block markers if present
            content = content.replace('```json', '').replace('```', '')

            try:
                items = json.loads(content) if content else []
            except json.JSONDecodeError as e:
                self.logger.error(f"JSON decode error: {e}")
                self.logger.error(f"Problematic content: {content}")
                return []

            if not isinstance(items, list):
                self.logger.error(f"Expected list but got {type(items)}")
                return []

            boq_items = []
            for item in items:
                try:
                    boq_item = BOQItem(**item)
                    boq_item.clean()
                    is_valid, errors = boq_item.validate()
                    if is_valid:
                        boq_items.append(boq_item)

                except Exception as e:
                    self.logger.warning(f"Error creating BOQ item: {e}")
                    continue

            return boq_items

        except Exception as e:
            self.logger.error(f"Error extracting fields: {e}")
            return []

    def process_document(self, pdf_path: str, output_path: str) -> None:
        """Process document and save to Excel."""
        try:
            pdf_path = Path(pdf_path)
            if not pdf_path.exists():
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

            page_texts = self.extract_text(pdf_path)
            all_items = []

            for page_num, text in page_texts.items():
                self.logger.info(f"Processing page {page_num}...")
                items = self.extract_fields(text)
                if items:  # Only log if items were found
                    self.logger.info(f"Found {len(items)} items on page {page_num}")
                for item in items:
                    item.page_number = page_num
                all_items.extend(items)

            if not all_items:
                self.logger.error("No valid BOQ items extracted from any page")
                raise ValueError("No valid BOQ items extracted")

            df = pd.DataFrame([
                {
                    'Product Name': item.product_name,
                    'Product Specification': item.specification,
                    'Quantity': item.quantity,
                    'Unit': item.unit,
                    'Page Number': item.page_number
                }
                for item in all_items
            ])

            df.to_excel(output_path, index=False)
            self.logger.info(f"Saved {len(df)} items to {output_path}")

        except Exception as e:
            self.logger.error(f"Processing failed: {str(e)}")
            raise



In [14]:
def main():
    try:
        extractor = BOQExtractor()
        pdf_path = "/content/tender_601813909e29b_TenderNitPPbag.pdf"
        output_path = "extracted_boq.xlsx"

        extractor.process_document(pdf_path, output_path)

    except Exception as e:
        logging.error(f"Processing failed: {str(e)}")
        raise

if __name__ == "__main__":
    main()



In [None]:
def extract_text( pdf_path: str) -> str:
          """Extract text from a single PDF and return as a single string with error handling."""
          all_text = []



          try:
              with open(pdf_path, 'rb') as file:
                  pdf = PdfReader(file)
                  for i, page in enumerate(pdf.pages):
                      text = page.extract_text()
                      if text.strip():  # Only include non-empty pages
                          # Clean unnecessary newlines
                          cleaned_text = " ".join(line.strip() for line in text.splitlines())
                          all_text.append(cleaned_text)
          except Exception as e:

              raise ValueError(f"Failed to process PDF at {pdf_path}")

          if not all_text:
              raise ValueError("No text could be extracted from the PDF")

          return " ".join(all_text)

def create_chunks( text: str, chunk_size: int = 8000, overlap: int = 20) -> List[str]:
        """Split text into chunks with overlap and table preservation."""
        if not text.strip():
            raise ValueError("Empty text provided for chunking")

        lines = text.split('\n')
        chunks = []
        current_chunk = []
        current_size = 0

        for line in lines:
            line_size = len(line.split())

            # Detect potential table rows
            is_table_row = bool(re.search(r'\d+\s*[x×]\s*\d+|[\d.,]+\s*(pcs|m2|m3|kg|tons?)', line, re.I))

            # If the chunk size exceeds and it's not a table row, create a new chunk
            if current_size + line_size > chunk_size and current_chunk and not is_table_row:
                # Add overlap from the previous chunk
                overlap_lines = current_chunk[-overlap:] if overlap > 0 else []
                chunks.append('\n'.join(current_chunk))
                current_chunk = overlap_lines.copy()
                current_size = sum(len(l.split()) for l in overlap_lines)

            # Add the current line to the chunk
            current_chunk.append(line)
            current_size += line_size

        # Append the last chunk
        if current_chunk:
            chunks.append('\n'.join(current_chunk))

        return chunks

In [None]:
pdf_path = "/content/tender_601813909e29b_TenderNitPPbag.pdf"
result=extract_text(pdf_path)
chunks=create_chunks(result)

In [None]:
chunks

['CEMENT CORPORATION OF INDIA LIMITED (A GOVERNMENT OF INDIA ENTERPRISE) RAJBAN CEMENT FACTORY(ISO 9001:2015Certified Company) Tehsil Paonta Sahib Distt. Sirmour – (HP)-173028 Tel: (01704)266223 / Fax 266270 Email: rajban@cciltd.in CIN-U74899DL1965GOI004322 Website: www.cciltd.in NIT No.:RCF/MM /PP Bag/2021 Dated: 01.02.2021 NOTICE INVITING E-TENDER CUM REVERSE AUCTION NOTICE (Only through e-procurement)  Online electronic bids through Electronic Tendering System (ETS) are inviting from the parties of reputed who are interested in tender for Supply of LAMINATED POLYPROPYLENE (PP) BAGS. The complete set of tender document is available on websites: www.cciltd.in, and www.cci-etender.com of Antares Systems Ltd.  E-TENDER NO. RCF/MM /PP Bag/2021 MODE OF TENDER e-Procurement System (Online Part A - Techno- Commercial Bid and Part B - Price Bid) through www.cci-etender.com of Antares Systems Ltd Date of NIT available to parties to download From 02.02.2021 (10.00 hrs.) Till 23.02.2021 (16.00 

In [None]:
import os
from groq import Groq

# Get API key from environment variable
GROQ_API_KEY = GROQ_API_KEY

# Raise an error if the API key is not found
if GROQ_API_KEY is None:
    raise ValueError("GROQ_API_KEY environment variable is not set.")

client = Groq(GROQ_API_KEY)
completion = client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=[],
    temperature=1,
    max_completion_tokens=1024,
    top_p=1,
    stream=True,
    stop=None,
)

for chunk in completion:
    print(chunk.choices[0].delta.content or "", end="")

ValueError: GROQ_API_KEY environment variable is not set.