In [1]:
# Import necessary libraries
import os
import json
import PyPDF2
from typing import List, Dict
import pandas as pd
from dotenv import load_dotenv

# LlamaIndex imports
from llama_index.core import Document, Settings, VectorStoreIndex, ServiceContext
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Read the .env file and merge its variables into the process environment
load_dotenv()

# Stop right away when the OpenAI credential is missing or blank
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key is None or openai_api_key == "":
    raise ValueError("Please set the OPENAI_API_KEY environment variable")

print("Environment setup complete!")

Environment setup complete!


In [3]:
# Set up LlamaIndex settings with OpenAI LLM and Embeddings
def initialize_settings(
    embed_model_name="text-embedding-3-small",
    llm_model_name="gpt-4o-mini",
    temperature=0.1,
    chunk_size=1000,
    chunk_overlap=200,
    embed_dimensions=1536,
):
    """
    Configure the global LlamaIndex ``Settings`` with an OpenAI embedding model and LLM.

    Calling the function with no arguments applies the notebook's original
    configuration; every value can be overridden by keyword.

    Args:
        embed_model_name: Name of the OpenAI embedding model.
        llm_model_name: Name of the OpenAI chat/completion model.
        temperature: Sampling temperature passed to the LLM.
        chunk_size: Value assigned to ``Settings.chunk_size``.
        chunk_overlap: Value assigned to ``Settings.chunk_overlap``.
        embed_dimensions: ``dimensions`` argument passed to the embedding model.

    Returns:
        The global ``Settings`` object (not a ServiceContext).

    Raises:
        ValueError: If ``chunk_overlap`` is larger than ``chunk_size``.
    """
    if chunk_overlap > chunk_size:
        raise ValueError(
            f"chunk_overlap ({chunk_overlap}) must not exceed chunk_size ({chunk_size})"
        )

    # Look the key up once so both clients are guaranteed to share the same value
    api_key = os.getenv("OPENAI_API_KEY")

    # Build both clients before touching Settings, so a constructor failure
    # leaves the global configuration unchanged
    embed_model = OpenAIEmbedding(
        model=embed_model_name,
        api_key=api_key,
        dimensions=embed_dimensions,
    )
    llm = OpenAI(
        model=llm_model_name,
        temperature=temperature,
        api_key=api_key,
    )

    # Publish the configuration on the global Settings object
    Settings.embed_model = embed_model
    Settings.llm = llm
    Settings.chunk_size = chunk_size
    Settings.chunk_overlap = chunk_overlap

    return Settings

# Apply the global LlamaIndex configuration
settings = initialize_settings()

# Smoke test: send one prompt through the configured LLM and show the reply
llm = settings.llm
response = llm.complete("Hello, can you tell me what model you are?")
print("LLM Test Response:", response)

LLM Test Response: I am based on OpenAI's GPT-3 model. How can I assist you today?


In [4]:
# Step 6: Loading and Processing PDF Documents
def extract_text_from_pdf(pdf_path):
    """
    Extract the text of every page of a PDF file using PyPDF2.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts, each page followed by a newline, or
        None when the file cannot be opened or parsed (the error is printed).
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Extraction has to happen while the file is still open.
            # A page that yields no text contributes an empty string, so one
            # such page cannot discard the text of the whole document.
            page_texts = [page.extract_text() or "" for page in reader.pages]
        # Join once instead of growing a string page by page
        return "".join(f"{page_text}\n" for page_text in page_texts)
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return None

def load_documents_from_pdfs(pdf_paths):
    """
    Load and process PDFs into LlamaIndex Document objects.

    For every PDF whose text can be extracted, a Document is created and the
    raw text is also written to ``extracted_<doc_type>.txt`` in the current
    working directory so it can be inspected.

    Args:
        pdf_paths: Mapping of document type to the path of its PDF file.

    Returns:
        Dict mapping document type to Document. PDFs that fail to load, or
        that yield only whitespace, are reported and left out.
    """
    documents = {}
    for doc_type, pdf_path in pdf_paths.items():
        try:
            print(f"Loading {doc_type} from {pdf_path}...")
            text = extract_text_from_pdf(pdf_path)

            # Whitespace-only output carries nothing to index, so it is
            # treated the same as a failed extraction.
            if not text or not text.strip():
                print(f"Failed to extract text from {pdf_path}")
                continue

            # Create a LlamaIndex Document
            documents[doc_type] = Document(text=text, metadata={"source": pdf_path, "type": doc_type})

            # Save extracted text to a file for verification. The encoding is
            # explicit because PDF text often contains non-ASCII characters
            # that the platform default encoding may be unable to write.
            dump_path = f"extracted_{doc_type}.txt"
            with open(dump_path, "w", encoding="utf-8") as f:
                f.write(text)
            print(f"Saved extracted text to '{dump_path}'")
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")

    return documents

# Map each document type to the PDF file it is read from
pdf_paths = dict(
    requirements="Requirements_Specification.pdf",
    task_estimates="SampleProjectTasksEstimates.pdf",
)

# Extract the text of each PDF and wrap it in a Document
documents = load_documents_from_pdfs(pdf_paths)

# Report the size and a short preview of every loaded document
for doc_type, doc in documents.items():
    print(f"\n{doc_type.capitalize()} document:")
    print(f"Length: {len(doc.text)} characters")
    print(f"Preview: {doc.text[:300]}...")

# Keep the complete text of each document under "<type>_full" for direct use
document_store = {}
for doc_type, doc in documents.items():
    document_store[f"{doc_type}_full"] = doc.text

Loading requirements from Requirements_Specification.pdf...
Saved extracted text to 'extracted_requirements.txt'
Loading task_estimates from SampleProjectTasksEstimates.pdf...
Saved extracted text to 'extracted_task_estimates.txt'

Requirements document:
Length: 4994 characters
Preview:  
Chicago WideCast  
Smart -Home Services  
 
 
 
 
 
Author: Atef Bader, PhD  
Last Edit: 7/5/2024  
Image/Model: dall-e-3 
 
 
Project Overview Statement:  
 
Chicago WideCast Smart -Home Services  is a startup company that is 
interested in automating all of its business process workflows utilizi...

Task_estimates document:
Length: 2305 characters
Preview:  
Task  Amount of Work  Productivity  Rate  
Project Plan      
Write Plan  56 pages  5 page s/Hour  
Review Plan      
Preparation for review    4 pages/Hour  
Review Meeting   8 pages/Hour  
Rework  39 defects  5 defects/Hour  
   
Risk Mitigation and Contingency Plan      
Write Plan  78 pages  5...


In [7]:
# Create vector indices for each document type
def create_vector_indices(documents):
    """
    Build and persist one vector index per document.

    Args:
        documents: Mapping of document type to a LlamaIndex Document.

    Returns:
        Dict mapping each document type to the VectorStoreIndex built from it.
        Each index is also persisted to a ``<doc_type>_index`` directory.
    """
    # Build the parser once and hand it the chunking values configured on the
    # global Settings explicitly; a bare from_defaults() call would fall back
    # to the parser's own defaults instead of those values.
    parser = SimpleNodeParser.from_defaults(
        chunk_size=Settings.chunk_size,
        chunk_overlap=Settings.chunk_overlap,
    )

    indices = {}
    for doc_type, doc in documents.items():
        print(f"Creating vector index for {doc_type}...")
        # Parse the document into nodes
        nodes = parser.get_nodes_from_documents([doc])

        # Create a vector index
        # Uses the global Settings instead of passing service_context
        index = VectorStoreIndex(nodes)
        indices[doc_type] = index

        # Save the index for future use
        index.storage_context.persist(f"{doc_type}_index")

    return indices

# Build (and persist) one vector index per loaded document
indices = create_vector_indices(documents)

# Wrap every index in its default query engine, keyed by document type
query_engines = dict(
    (kind, idx.as_query_engine()) for kind, idx in indices.items()
)

# One sample question per document type
test_queries = dict(
    requirements="What TV plans does WideCast offer?",
    task_estimates="What are the productivity rates for writing plans?",
)

# Run each sample question against the engine for its document type
for doc_type in test_queries:
    query = test_queries[doc_type]
    print(f"\n{doc_type.capitalize()} query: '{query}'")
    response = query_engines[doc_type].query(query)
    print(f"Response: {response}")

Creating vector index for requirements...
Creating vector index for task_estimates...

Requirements query: 'What TV plans does WideCast offer?'
Response: WideCast offers the following TV plans:

1. Basic - 50 channels
2. BasicPlus – 100 channels
3. Ultimate - 200 channels

Task_estimates query: 'What are the productivity rates for writing plans?'
Response: The productivity rates for writing plans are as follows:

- Project Plan: 5 pages per hour
- Risk Mitigation and Contingency Plan: 5 pages per hour
- Analysis Document: 5 pages per hour
- Design Document (DD): 4 pages per hour
- Test Plan (TP): 6 pages per day
