In [33]:
import os, sys, platform, glob, json

DATA_DIR = "data"
OUTPUT_DIR = "outputs"
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Python:", sys.version.split()[0])
print("Platform:", platform.platform())
print("CWD:", os.getcwd())
print("data/ exists:", os.path.isdir(DATA_DIR))
print("outputs/ exists:", os.path.isdir(OUTPUT_DIR))


Python: 3.12.7
Platform: Windows-11-10.0.26200-SP0
CWD: C:\Users\samat\Desktop\Emplay_Project_llm
data/ exists: True
outputs/ exists: True


In [34]:
!pip install -q python-dotenv google-generativeai pypdf python-docx beautifulsoup4 lxml html2text


In [35]:
from dotenv import load_dotenv
import google.generativeai as genai

load_dotenv()  # reads .env in the current folder

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GEMINI_MODEL   = os.getenv("GEMINI_MODEL", "models/gemini-2.0-flash")

if not GOOGLE_API_KEY:
    raise ValueError("❌ GOOGLE_API_KEY missing. Create a .env with GOOGLE_API_KEY and GEMINI_MODEL.")

genai.configure(api_key=GOOGLE_API_KEY)

generation_config = {
    "temperature": 0.1,
    "top_p": 0.9,
    "top_k": 40,
    "response_mime_type": "application/json"
}

model = genai.GenerativeModel(GEMINI_MODEL, generation_config=generation_config)

print("✅ Loaded key from .env")
print("✅ Gemini model ready:", GEMINI_MODEL)


✅ Loaded key from .env
✅ Gemini model ready: models/gemini-2.0-flash


In [36]:
resp = model.generate_content("Return exactly this text: READY")
print(resp.text.strip())


"READY"


In [37]:
from typing import Tuple
from bs4 import BeautifulSoup
import html2text
from docx import Document
from pypdf import PdfReader

def read_file_as_text(path: str) -> Tuple[str, str]:
    ext = os.path.splitext(path)[1].lower()
    try:
        if ext == ".pdf":
            reader = PdfReader(path)
            pages = []
            for page in reader.pages:
                pages.append(page.extract_text() or "")
            return "pdf", "\n".join(pages)
        elif ext == ".docx":
            doc = Document(path)
            return "docx", "\n".join(p.text for p in doc.paragraphs)
        elif ext in (".html", ".htm"):
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                soup = BeautifulSoup(f, "lxml")
            md = html2text.html2text(str(soup))
            return "html", md
        else:
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                return "text", f.read()
    except Exception as e:
        return ext.strip("."), f"[Error reading file: {e}]"

print("✅ Readers ready.")


✅ Readers ready.


In [38]:
STANDARD_OUTPUT_SCHEMA = {
  "bid_number": "",
  "title": "",
  "due_date": "",
  "bid_submission_type": "",
  "term_of_bid": "",
  "pre_bid_meeting": "",
  "installation": "",
  "bid_bond_requirement": "",
  "delivery_date": "",
  "payment_terms": "",
  "additional_documentation_required": "",
  "mfg_for_registration": "",
  "contract_or_cooperative_to_use": "",
  "model_no": "",
  "part_no": "",
  "product": "",
  "contact_info": "",
  "company_name": "",
  "bid_summary": "",
  "product_specification": "",
  "_metadata": {
    "extraction_timestamp": "",
    "confidence": None,
    "success": None,
    "processing_time": None,
    "validation": {
      "is_valid": None,
      "missing_required_fields": [],
      "data_quality_score": None,
      "warnings": []
    }
  }
}
required_fields = [k for k in STANDARD_OUTPUT_SCHEMA.keys() if k != "_metadata"]
print("✅ Schema keys:", len(STANDARD_OUTPUT_SCHEMA))


✅ Schema keys: 21


In [39]:
import json
from datetime import datetime

JSON_INSTRUCTIONS = f"""
You are an expert procurement analyst. Extract information ONLY from the document.
Return ONE JSON object with EXACTLY these keys and structure (no extras, no commentary):

{json.dumps(STANDARD_OUTPUT_SCHEMA, indent=2)}

STRICT RULES:
- If a field is not present, use an empty string "" (we post-process to 'Not specified').
- NEVER invent facts. If unknown, leave "".
- Keep the original date format if present.
- Use short, professional phrasing for summaries/specifications.
- "_metadata":
  - Fill "extraction_timestamp" as ISO 8601 (UTC).
  - Provide "confidence" (0..1): your self-estimate of extraction correctness.
  - Set "success" true/false.
  - "processing_time" in seconds (estimate is OK).
  - "validation" can be your self-check notes.

Return ONLY raw JSON (no markdown, no backticks).
"""
print("✅ Prompt ready.")


✅ Prompt ready.


In [40]:
import re, time

def validate_and_score(data: dict) -> dict:
    """
    Scores quality based on:
    - presence of required fields (coverage)
    - plausibility checks (email, phone, date-like)
    - richness for summary/spec (>20 chars)
    Produces a 0..1 'data_quality_score' and updates _metadata.confidence if model omitted it.
    """
    coverage = sum(1 for k in required_fields if str(data.get(k, "")).strip() != "")
    coverage_score = coverage / len(required_fields)

    text = " ".join(str(data.get(k, "")) for k in required_fields)
    email_hit = 1 if re.search(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", text, re.I) else 0
    phone_hit = 1 if re.search(r"\b(\+?\d[\d\-\s]{6,})\b", text) else 0
    date_hit  = 1 if re.search(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|[A-Z][a-z]+ \d{1,2}, \d{4})\b", text) else 0

    richness = 0
    if len(data.get("bid_summary","")) >= 20: richness += 0.5
    if len(data.get("product_specification","")) >= 20: richness += 0.5

    quality = 0.5*coverage_score + 0.15*email_hit + 0.1*phone_hit + 0.1*date_hit + 0.15*richness
    quality = max(0.0, min(1.0, quality))

    md = data.setdefault("_metadata", {})
    val = md.setdefault("validation", {})
    val["is_valid"] = True
    val["missing_required_fields"] = [k for k in required_fields if str(data.get(k,"")).strip()==""]
    val["data_quality_score"] = round(quality, 2)
    val.setdefault("warnings", [])
    # If model didn't supply confidence or gave 0, backfill with quality (slightly downscaled)
    if not isinstance(md.get("confidence"), (int, float)) or md["confidence"] is None or md["confidence"] == 0:
        md["confidence"] = round(max(0.1, quality*0.9), 2)
    return data

def normalize_not_specified(data: dict) -> dict:
    """Convert any empty string/None to 'Not specified' except inside _metadata."""
    fixed = {}
    for k,v in data.items():
        if k == "_metadata":
            fixed[k] = v
        elif v is None or (isinstance(v, str) and v.strip() == ""):
            fixed[k] = "Not specified"
        else:
            fixed[k] = v
    return fixed

print("✅ Validator & normalizer ready.")


✅ Validator & normalizer ready.


In [41]:
def extract_json_from_text(text: str) -> dict:
    start = time.perf_counter()
    try:
        resp = model.generate_content([
            {"role": "user", "parts": [
                {"text": JSON_INSTRUCTIONS},
                {"text": "DOCUMENT TEXT:\n\n" + text[:120000]}
            ]}
        ])
        raw = (resp.text or "").strip()
        # Defensive un-fencing
        raw = raw.strip("`").strip()
        if raw.lower().startswith("json"):
            raw = raw[4:].strip()

        data = json.loads(raw)

        # ensure schema completeness
        for k in STANDARD_OUTPUT_SCHEMA:
            if k not in data:
                data[k] = STANDARD_OUTPUT_SCHEMA[k]

        # Fill/adjust metadata
        md = data.setdefault("_metadata", {})
        md.setdefault("extraction_timestamp", datetime.utcnow().isoformat())
        md.setdefault("success", True)
        md.setdefault("processing_time", round(time.perf_counter() - start, 3))
        md.setdefault("confidence", None)  # model may set; validator will backfill if needed

        # Validate + compute data_quality_score + finalize confidence
        data = validate_and_score(data)
        return data
    except Exception as e:
        fail = {**STANDARD_OUTPUT_SCHEMA}
        fail["_metadata"] = {
            **STANDARD_OUTPUT_SCHEMA["_metadata"],
            "extraction_timestamp": datetime.utcnow().isoformat(),
            "success": False,
            "processing_time": round(time.perf_counter() - start, 3),
            "confidence": 0.1,
            "validation": {
                "is_valid": False,
                "missing_required_fields": required_fields,
                "data_quality_score": 0.0,
                "warnings": [f"LLM exception: {e}"]
            }
        }
        return fail

print("✅ LLM extractor ready.")


✅ LLM extractor ready.


In [42]:
files = sorted(glob.glob(os.path.join(DATA_DIR, "*")))
print("Found", len(files), "files")
for f in files:
    print(" -", os.path.basename(f))


Found 9 files
 - Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf
 - Addendum 2 RFP JA-207652 Student and Staff Computing Devices.pdf
 - Contract_Affidavit.pdf
 - Dell Laptops w_Extended Warranty - Bid Information - {3} _ BidNet Direct.html
 - Dell_Laptop_Specs.pdf
 - JA-207652 Student and Staff Computing Devices FINAL.pdf
 - Mercury_Affidavit.pdf
 - PORFP_-_Dell_Laptop_Final.pdf
 - Student and Staff Computing Devices __SOURCING #168884__ - Bid Information - {3} _ BidNet Direct.html


In [43]:
assert files, "No files found in data/. Add your 9 files, then rerun."

test_file = files[0]
kind, text = read_file_as_text(test_file)
print("Reading:", os.path.basename(test_file), "| kind:", kind, "| chars:", len(text))

result = extract_json_from_text(text)
result = normalize_not_specified(result)

print("confidence:", result["_metadata"].get("confidence"))
print("data_quality_score:", result["_metadata"]["validation"].get("data_quality_score"))
print("Top-level keys:", list(result.keys())[:6], "...")

out_path = os.path.join(OUTPUT_DIR, os.path.splitext(os.path.basename(test_file))[0] + "_result.json")
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2)
print("✅ Saved:", out_path)


Reading: Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf | kind: pdf | chars: 13075
confidence: 0.95
data_quality_score: 0.5
Top-level keys: ['bid_number', 'title', 'due_date', 'bid_submission_type', 'term_of_bid', 'pre_bid_meeting'] ...
✅ Saved: outputs\Addendum 1 RFP JA-207652 Student and Staff Computing Devices_result.json


  md.setdefault("extraction_timestamp", datetime.utcnow().isoformat())


In [44]:
outputs = []

for p in files:
    print("Processing:", os.path.basename(p))
    kind, text = read_file_as_text(p)
    data = extract_json_from_text(text)
    data = normalize_not_specified(data)
    out_path = os.path.join(OUTPUT_DIR, os.path.splitext(os.path.basename(p))[0] + "_result.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(" -> saved:", out_path, "| confidence:", data["_metadata"].get("confidence"))
    outputs.append(out_path)

print("\n✅ Done. Total:", len(outputs))


Processing: Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf


  md.setdefault("extraction_timestamp", datetime.utcnow().isoformat())


 -> saved: outputs\Addendum 1 RFP JA-207652 Student and Staff Computing Devices_result.json | confidence: 0.95
Processing: Addendum 2 RFP JA-207652 Student and Staff Computing Devices.pdf
 -> saved: outputs\Addendum 2 RFP JA-207652 Student and Staff Computing Devices_result.json | confidence: 0.95
Processing: Contract_Affidavit.pdf
 -> saved: outputs\Contract_Affidavit_result.json | confidence: 0.95
Processing: Dell Laptops w_Extended Warranty - Bid Information - {3} _ BidNet Direct.html
 -> saved: outputs\Dell Laptops w_Extended Warranty - Bid Information - {3} _ BidNet Direct_result.json | confidence: 0.95
Processing: Dell_Laptop_Specs.pdf
 -> saved: outputs\Dell_Laptop_Specs_result.json | confidence: 0.95
Processing: JA-207652 Student and Staff Computing Devices FINAL.pdf
 -> saved: outputs\JA-207652 Student and Staff Computing Devices FINAL_result.json | confidence: 0.95
Processing: Mercury_Affidavit.pdf
 -> saved: outputs\Mercury_Affidavit_result.json | confidence: 0.95
Processing

In [45]:
sample = outputs[-1] if outputs else None
assert sample, "No outputs yet."
with open(sample, "r", encoding="utf-8") as f:
    data = json.load(f)

print("Preview keys:", list(data.keys()))
print("\ncompany_name:", data.get("company_name"))
print("due_date:", data.get("due_date"))
print("confidence:", data["_metadata"].get("confidence"))
print("\n_metadata:\n", json.dumps(data.get("_metadata", {}), indent=2))


Preview keys: ['bid_number', 'title', 'due_date', 'bid_submission_type', 'term_of_bid', 'pre_bid_meeting', 'installation', 'bid_bond_requirement', 'delivery_date', 'payment_terms', 'additional_documentation_required', 'mfg_for_registration', 'contract_or_cooperative_to_use', 'model_no', 'part_no', 'product', 'contact_info', 'company_name', 'bid_summary', 'product_specification', '_metadata']

company_name: Dallas Independent School District
due_date: 07/09/2024 03:00 PM EDT
confidence: 0.95

_metadata:
 {
  "extraction_timestamp": "2024-06-04T18:29:01Z",
  "confidence": 0.95,
  "success": true,
  "processing_time": 5,
  "validation": {
    "is_valid": true,
    "missing_required_fields": [
      "installation",
      "bid_bond_requirement",
      "delivery_date",
      "payment_terms",
      "additional_documentation_required",
      "mfg_for_registration",
      "contract_or_cooperative_to_use",
      "model_no",
      "part_no"
    ],
    "data_quality_score": 0.78,
  }
}
