# Local RFP Staffing Requirements Extractor (Azure OpenAI)

This standalone notebook extracts staffing requirements from local RFP documents using Azure's OpenAI service with a GPT-4 model. Structured outputs can be used when calling OpenAI directly, or with Azure once it supports them. Until then they are not used here, though you will see the commented-out code needed to enable them below.

To use, create an `Input_RFPs` folder in the same directory as this notebook and, inside it, place one folder per RFP, named after the RFP and containing all of its associated files (`.pdf` and `.docx` files are read). There can be multiple folders, each with its own set of RFP files.

The notebook extracts the staffing requirements from each RFP and saves the results to an output folder, by default called `Extracted_RFP_key_personnel`.

## Setup

In [None]:
import os
import json
import logging
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI
from docx import Document
import pypdf

# Pull settings from a local .env file into the process environment
load_dotenv()

# Emit INFO-and-above log records, prefixed with timestamp and level
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Azure OpenAI client; endpoint and key are read from environment variables
client = AzureOpenAI(
    api_version="2024-06-01",
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
)

# Deployment name passed as `model` on every chat completion request
deployment_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME")

## Helper Functions

In [None]:
def read_file(file_path):
    """Return the plain text of a PDF or DOCX file.

    Args:
        file_path: Path to the document. The reader is chosen from the file
            extension, compared case-insensitively.

    Returns:
        The extracted text, with every PDF page and every DOCX paragraph
        terminated by a newline. A file with any other extension logs a
        warning and yields an empty string.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == '.pdf':
        with open(file_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)
            # End each page with a newline so the last word of one page does
            # not fuse with the first word of the next. The `or ""` is purely
            # defensive, in case a page yields no text object at all.
            pages = [
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            ]
        return "".join(pages)

    if extension == '.docx':
        doc = Document(file_path)
        return "".join(para.text + "\n" for para in doc.paragraphs)

    logging.warning(f"Unsupported file type: {file_path}")
    return ""

def process_rfp_folder_local(folder_path):
    """Collect the text of every PDF/DOCX file directly inside an RFP folder.

    Args:
        folder_path: Directory holding the documents of a single RFP. A
            trailing path separator is tolerated.

    Returns:
        A ``(rfp_id, all_content)`` tuple. ``rfp_id`` is the folder's own
        name; ``all_content`` is the text of each supported file followed by
        a blank line, concatenated in sorted filename order. It is an empty
        string when the folder holds no supported files.
    """
    logging.info(f"Processing RFP folder: {folder_path}")
    # normpath drops a trailing separator, which would otherwise make
    # basename return an empty id.
    rfp_id = os.path.basename(os.path.normpath(folder_path))

    sections = []
    # os.listdir order is arbitrary; sort so the result is reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not file_path.lower().endswith(('.pdf', '.docx')):
            continue
        # A sub-directory that happens to be named like a document is skipped.
        if not os.path.isfile(file_path):
            continue
        sections.append(read_file(file_path) + "\n\n")

    all_content = "".join(sections)
    # Log the size instead of printing the whole document text to stdout.
    logging.debug(f"Collected {len(all_content)} characters for RFP {rfp_id}")

    return rfp_id, all_content


## Azure OpenAI Function with Structured Output

In [None]:
def extract_roles_and_requirements_local(content, rfp_id):
    """Ask the Azure OpenAI chat deployment for the roles an RFP requires.

    Args:
        content: Full text of the RFP, embedded verbatim in the user prompt.
        rfp_id: Identifier of the RFP; used only in log messages.

    Returns:
        The message content of the first choice in the completion, returned
        as-is (it is not parsed as JSON).
    """
    logging.info(f"Extracting roles and requirements for RFP: {rfp_id}")

    system_message = {
        "role": "system",
        "content": "You are an expert in analyzing RFP documents and extracting staffing requirements.",
    }
    user_message = {
        "role": "user",
        "content": f"Analyze the following RFP content and extract the required roles and their requirements.\n\nRFP Content:\n{content}",
    }

    completion = client.chat.completions.create(
        model=deployment_name,
        messages=[system_message, user_message],
        # Structured-output schema, kept disabled; uncomment to enable it.
        # response_format= {
        #     "type": "json_schema",
        #     "json_schema": {
        #         "name": "RFP_Staff_Requirements",
        #         "schema":{
        #             "type": "object",
        #             "properties": {
        #                 "RFP_ID": {"type": "string"},
        #                 "Title": {"type": "string"},
        #                 "Required_Roles": {
        #                     "type": "array",
        #                     "items": {
        #                         "type": "object",
        #                         "properties": {
        #                             "Role": {"type": "string"},
        #                             "Requirements": {
        #                                 "type": "array",
        #                                 "items": {"type": "string"}
        #                             }
        #                         },
        #                         "required": ["Role", "Requirements"]
        #                     }
        #                 }
        #             },
        #             "required": ["RFP_ID", "Title", "Required_Roles"]
        #         },
        #         "strict":True
        #     }
        # }
    )

    # With structured output enabled, the reply would be parsed instead:
    # extracted_info = json.loads(completion.choices[0].message.content)
    # extracted_info["RFP_ID"] = rfp_id
    extracted_info = completion.choices[0].message.content

    total_tokens = completion.usage.total_tokens
    logging.info(f"Token usage for RFP {rfp_id}: {total_tokens} tokens")

    return extracted_info



## Main Processing Loop
Processes all RFP folders and saves the extracted information.

In [None]:
def main(input_dir="Input_RFPs", output_dir="Extracted_RFP_key_personnel"):
    """Extract staffing requirements for every RFP folder under ``input_dir``.

    Each immediate sub-directory of ``input_dir`` is treated as one RFP. Its
    documents are read, sent to the model, and the reply is written to
    ``<output_dir>/<rfp_id>_extracted_info.txt``. A failure on one RFP is
    logged with its traceback and does not stop the remaining RFPs.

    Args:
        input_dir: Directory containing one sub-folder per RFP.
        output_dir: Directory that receives the result files; created if
            it does not exist.

    Returns:
        None.
    """
    if not os.path.isdir(input_dir):
        logging.error(f"Input directory not found: {input_dir}")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Sorted so RFPs are processed in a reproducible order.
    for folder_name in sorted(os.listdir(input_dir)):
        folder_path = os.path.join(input_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue
        try:
            rfp_id, content = process_rfp_folder_local(folder_path)
            if not content.strip():
                # Nothing was extracted, so there is nothing to analyze;
                # avoid spending a model call on an empty prompt.
                logging.warning(f"No readable PDF/DOCX text found for RFP {folder_name}; skipping")
                continue

            extracted_info = extract_roles_and_requirements_local(content, rfp_id)
            output_file = os.path.join(output_dir, f"{rfp_id}_extracted_info.txt")
            # With structured output enabled, write JSON instead:
            #with open(output_file, 'w') as f:
            #    json.dump(extracted_info, f, indent=2)
            # Explicit UTF-8: model output may contain characters that the
            # platform's default encoding cannot represent.
            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(extracted_info)

            logging.info(f"Extracted information saved to {output_file}")

        except Exception as e:
            # Top-level boundary: record the traceback and move on.
            logging.exception(f"Error processing RFP {folder_name}: {str(e)}")

## Run the Extraction Process


In [None]:
# Start the extraction only when this code runs as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

## Conclusion
This notebook has processed all RFP folders in the `Input_RFPs` directory and saved the extracted staffing requirements as text files (`<RFP name>_extracted_info.txt`) in the `Extracted_RFP_key_personnel` directory. Check the log output for any error messages.