In [1]:
import os
import pandas as pd
import re
import json
import pdfplumber
from bs4 import BeautifulSoup

In [2]:
def get_pdf_text(filepath):
    """Return the text of a PDF, one newline-terminated chunk per page.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped.

    Args:
        filepath: Path of the PDF file to open with ``pdfplumber``.

    Returns:
        The concatenated page texts. If anything goes wrong while opening or
        reading the file, a message is printed and whatever text was collected
        before the failure is returned (possibly the empty string).
    """
    chunks = []
    try:
        with pdfplumber.open(filepath) as document:
            for page in document.pages:
                page_text = page.extract_text()
                if not page_text:
                    continue
                chunks.append(page_text + "\n")
    except Exception as error:
        # Best effort: report the problem and keep the pages read so far.
        print(f"Could not read {filepath}: {error}")
    return "".join(chunks)

In [3]:
def get_html_text(filepath):
    """Return the visible text of an HTML file with a space between text nodes.

    The file is read as UTF-8 and parsed with BeautifulSoup's built-in
    ``html.parser``.

    Args:
        filepath: Path of the HTML file to read.

    Returns:
        The text produced by ``get_text(separator=" ")``, or an empty string
        (after printing a message) if the file cannot be read or parsed.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as handle:
            markup = handle.read()
        parsed = BeautifulSoup(markup, "html.parser")
        return parsed.get_text(separator=" ")
    except Exception as error:
        # Best effort: an unreadable file contributes no text.
        print(f"Could not read {filepath}: {error}")
        return ""

In [4]:
def find_rfp_info(text):
    """Extract the RFP fields from a document's text using fixed regex patterns.

    Each field is looked up independently with ``match_or_blank``, so a field
    whose pattern does not occur in ``text`` is reported as an empty string.

    Args:
        text: Full text extracted from one document.

    Returns:
        A dict mapping each field label to the first match of its pattern,
        in the order the fields are listed below.
    """
    # (field label, pattern) pairs; the order here is the order of the dict keys.
    field_patterns = (
        ("Bid Number", r"(JA-\d{6}|BPM\d+|E20P\d+)"),
        ("Title", r"(Student and Staff Computing Devices|Dell Laptops.*Warranty)"),
        ("Due Date", r"\d{2}/\d{2}/\d{4}"),
        ("Bid Submission Type", r"(BidNet|iSupplier|eMaryland Marketplace)"),
        ("Term of Bid", r"\d+\s*year|\d+\s*month"),
        ("Pre Bid Meeting", r"(Pre[- ]?Bid Meeting.*\d{4})"),
        ("Installation", r"(installation|white glove deployment|asset tagging)"),
        ("Bid Bond Requirement", r"(bond requirement|small business reserve)"),
        ("Delivery Date", r"delivery.*\d{4}|\d+\s*days.*award"),
        ("Payment Terms", r"net\s*\d+"),
        ("Any Additional Documentation Required", r"(addendum|mwbe forms|affidavit|W9|authorization)"),
        ("MFG for Registration", r"(manufacturer registration|Dell|vendor registration)"),
        ("Contract or Cooperative to use", r"(contract.*\d+|cooperative.*agreement|EPCNT|CTPA)"),
        ("Model_no", r"(Latitude\s*\d+|WD22TB4|SI CC\d+)"),
        ("Part_no", r"(SI CC\d+|WD22TB4)"),
        ("Product", r"(student laptops|Chromebooks|Dell Latitude.*Dock|Laptop)"),
        ("contact_info", r"[\w\.-]+@[\w\.-]+"),
        ("company_name", r"(Dallas Independent School District|State of Maryland)"),
        ("Bid Summary", r"(RFP.*devices|laptop refresh.*employees)"),
        ("Product Specification", r"(Chromebook|Windows|Apple|Latitude 5550|Dock|copilot ready|extended warranty)"),
    )
    return {label: match_or_blank(regex, text) for label, regex in field_patterns}

In [5]:
def match_or_blank(pattern, text):
    """Return the first case-insensitive match of ``pattern`` in ``text``.

    Args:
        pattern: Regular expression to search for.
        text: String to search in.

    Returns:
        The whole matched substring with surrounding whitespace stripped, or
        an empty string (never ``None``) when the pattern does not occur.
    """
    found = re.compile(pattern, flags=re.IGNORECASE).search(text)
    if found is None:
        return ""
    return found.group(0).strip()

In [6]:
def scan_rfp_folder(folder_path):
    """Extract RFP information from every PDF and HTML file in a folder.

    Only the folder's direct entries are considered (no recursion). Entries
    are processed in sorted name order so the output is reproducible, and
    anything that is not a regular file is ignored even if its name ends in
    ``.pdf`` or ``.html``.

    Args:
        folder_path: Directory containing the RFP documents.

    Returns:
        A list with one dict per processed file: the fields produced by
        ``find_rfp_info`` plus a ``"Source File"`` key holding the file name.

    Raises:
        OSError: If ``folder_path`` cannot be listed (for example, it does
            not exist).
    """
    output = []
    # os.listdir gives no ordering guarantee; sort for a stable record order.
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)
        # A directory named like "archive.pdf" is not a document; skipping it
        # avoids a spurious all-blank record.
        if not os.path.isfile(filepath):
            continue
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            text = get_pdf_text(filepath)
        elif lowered.endswith(".html"):
            text = get_html_text(filepath)
        else:
            continue
        info = find_rfp_info(text)
        info["Source File"] = filename
        output.append(info)
    return output

In [7]:
# Entry point: scan the RFP folder and save the extracted records as JSON.
if __name__ == "__main__":
    folder = "./rfp_docs"
    results = scan_rfp_folder(folder)
    with open("structured_output.json", "w", encoding="utf-8") as f:
        # Serialise first, then write the finished document in one call.
        f.write(json.dumps(results, indent=4))
    print("Extraction finished. See 'structured_output.json' for results.")

Extraction finished. See 'structured_output.json' for results.


In [8]:
# Read the saved extraction results back from disk into `records`.
with open("structured_output.json", "r", encoding="utf-8") as file:
    records = json.loads(file.read())

In [11]:
# Pretty-print every loaded record as JSON with a 4-space indent.
# NOTE(review): this prints the whole list, so the output grows with the
# number of records loaded from structured_output.json.
print(json.dumps(records, indent=4))

[
    {
        "Bid Number": "JA-207652",
        "Title": "Student and Staff Computing Devices",
        "Due Date": "",
        "Bid Submission Type": "",
        "Term of Bid": "3 year",
        "Pre Bid Meeting": "",
        "Installation": "installation",
        "Bid Bond Requirement": "",
        "Delivery Date": "",
        "Payment Terms": "",
        "Any Additional Documentation Required": "ADDENDUM",
        "MFG for Registration": "",
        "Contract or Cooperative to use": "",
        "Model_no": "",
        "Part_no": "",
        "Product": "Laptop",
        "contact_info": "",
        "company_name": "",
        "Bid Summary": "RFP JA-207652 Student and Staff Computing Devices",
        "Product Specification": "Chromebook",
        "Source File": "Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf"
    },
    {
        "Bid Number": "JA-207652",
        "Title": "Student and Staff Computing Devices",
        "Due Date": "",
        "Bid Submission Type

In [13]:
# Build a DataFrame from the loaded records; as the last expression in the
# cell it is what the notebook displays (one row per record, one column per key).
pd.DataFrame(records)

Unnamed: 0,Bid Number,Title,Due Date,Bid Submission Type,Term of Bid,Pre Bid Meeting,Installation,Bid Bond Requirement,Delivery Date,Payment Terms,...,MFG for Registration,Contract or Cooperative to use,Model_no,Part_no,Product,contact_info,company_name,Bid Summary,Product Specification,Source File
0,JA-207652,Student and Staff Computing Devices,,,3 year,,installation,,,,...,,,,,Laptop,,,RFP JA-207652 Student and Staff Computing Devices,Chromebook,Addendum 1 RFP JA-207652 Student and Staff Com...
1,JA-207652,Student and Staff Computing Devices,,,,,,,,,...,,,,,,,,RFP JA-207652 Student and Staff Computing Devices,,Addendum 2 RFP JA-207652 Student and Staff Com...
2,,,,,24 month,,,,,,...,,"contract term, on or before: (i) May 31, to co...",,,,,State of Maryland,,,Contract_Affidavit.pdf
3,BPM044557,Dell Laptops w/Extended Warranty,05/28/2024,BidNet,,,,,,,...,Dell,,Latitude 5550,WD22TB4,Laptop,Thawkins@treasurer.state.md.us,State of Maryland,,Extended Warranty,Dell Laptops w_Extended Warranty - Bid Informa...
4,,,,,,,,,,,...,Dell,,Latitude 5550,,,Tax_Department@dell.com,,,Latitude 5550,Dell_Laptop_Specs.pdf
5,JA-207652,Student and Staff Computing Devices,,iSupplier,5 year,,installation,bond requirement,,,...,,contractor 1,,,laptop,JALZATE@dallasisd.org,Dallas Independent School District,RFP) will be for student and staff computing d...,Chromebook,JA-207652 Student and Staff Computing Devices ...
6,,,,,,,,,,,...,,"contract, and (3",,,,,State of Maryland,,,Mercury_Affidavit.pdf
7,E20P4600040,,05/24/2024,eMaryland Marketplace,3 year,,,SMALL BUSINESS RESERVE,45 days of Award,,...,Dell,,Latitude 5550,WD22TB4,Laptop,thawkins@treasurer.state.md.us,,,Extended Warranty,PORFP_-_Dell_Laptop_Final.pdf
8,JA-207652,Student and Staff Computing Devices,05/29/2024,BidNet,,,,,,,...,,,,,laptop,ProcurementCS@dallasisd.org,Dallas Independent School District,RFP) will be for student and staff computing d...,,Student and Staff Computing Devices __SOURCING...


In [14]:
# COMPLETED