# agent_document_summarize

In [27]:
from typing import List, Dict, Any
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.vectorstores import Chroma
import json
from module.document_processing import load_documents, split_documents
from module.get_model_and_embeding import get_llm, get_embedding
import config

In [35]:
def setup_vectorstore(documents: List[Any], embeddings: Any) -> Any:
    """Build a Chroma vector store over ``documents``.

    Args:
        documents: Documents to index; handed straight to
            ``Chroma.from_documents``.
        embeddings: Embedding object forwarded as the ``embedding`` argument.

    Returns:
        Whatever ``Chroma.from_documents`` returns for these inputs.
    """
    vectorstore = Chroma.from_documents(documents=documents, embedding=embeddings)
    return vectorstore

In [36]:
def create_prompt_template() -> ChatPromptTemplate:
    """Build the chat prompt that asks the model for roles and their tasks.

    The template declares two input variables, ``myteam`` and ``context``, and
    instructs the model to reply with nothing but a JSON object that maps each
    role to a list of tasks.
    """
    # The doubled braces around the example are literal braces in the rendered
    # prompt; only {myteam} and {context} are substituted.
    extraction_template = """
        <system>
        You are a helpful assistant specializing in data extraction from documents.
        </system>

        <user>
        Your task is to extract all roles mentioned in the given documents and their associated tasks.
        Provide your answer as a JSON string where keys are roles and values are lists of tasks.
        Only return the JSON string without any additional explanation or formatting.

        Example format:
        {{"Role1": ["Task1", "Task2", "Task3"], "Role2": ["Task1", "Task2"]}}

        Ensure your output is a valid JSON string that can be parsed directly.
        </user>

        <query>
        Role in my team: {myteam}
        Extract all roles and their associated tasks from the following document:
        {context}
        </query>
        """
    return ChatPromptTemplate.from_template(template=extraction_template)

In [30]:

def process_document(
    doc: Any,
    chain: Any,
    current_roles: List[str]
) -> Dict[str, List[str]]:
    """Run ``chain`` on one document and return the extracted role -> tasks map.

    The chain is invoked with a dict holding ``myteam`` (the current roles
    joined with ", ") and ``document_content`` (``doc.page_content``).

    NOTE: the template built by ``create_prompt_template`` declares a
    ``context`` variable rather than ``document_content``, so ``chain`` must be
    built from a prompt that accepts ``document_content``.

    Args:
        doc: Object exposing the document text as ``page_content``.
        chain: Object with an ``invoke(dict)`` method that returns the model's
            reply as a string.
        current_roles: Role names already present in the team.

    Returns:
        The parsed JSON object on success. An empty dict if the chain raises,
        the reply contains no parseable JSON object, or the parsed value is
        not a JSON object. Failures are reported on stdout, never raised.
    """
    try:
        response = chain.invoke({
            "myteam": ", ".join(current_roles),
            "document_content": doc.page_content
        })

        try:
            parsed = json.loads(response)
        except json.JSONDecodeError:
            # Models often wrap the JSON in Markdown fences or add a sentence
            # around it despite the instructions. Retry on the outermost
            # {...} span before giving up.
            start, end = response.find("{"), response.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(response[start:end + 1])
    except json.JSONDecodeError as e:
        print(f"Error parsing response for document: {e}")
        return {}
    except Exception as e:
        # Best-effort by design: one bad document must not abort a batch.
        print(f"Error processing document: {e}")
        return {}

    # Valid JSON that is not an object (list, string, number, null) would
    # break the declared return type for callers that expect a mapping.
    if not isinstance(parsed, dict):
        print(
            "Error parsing response for document: "
            f"expected a JSON object, got {type(parsed).__name__}"
        )
        return {}
    return parsed

In [31]:
def process_single_document(
    doc_content: str,
    current_roles: List[str],
    llm: Any,
    output_parser: Any
) -> Dict[str, List[str]]:
    """Extract a role -> tasks map from one document's text.

    Builds a ``prompt | llm | output_parser`` chain on every call and invokes
    it with ``myteam`` (the current roles joined with ", ") and
    ``document_content`` (the raw text).

    Args:
        doc_content: Text of the document to analyse.
        current_roles: Role names already present in the team.
        llm: Model runnable placed after the prompt in the chain.
        output_parser: Final chain step; expected to yield the model reply as
            a string (e.g. ``StrOutputParser``) so it can be JSON-decoded.

    Returns:
        The parsed JSON object on success. An empty dict if building or
        running the chain raises, the reply contains no parseable JSON object,
        or the parsed value is not a JSON object. Failures are reported on
        stdout, never raised.
    """
    try:
        # Create prompt template for this specific document
        prompt = ChatPromptTemplate.from_template(
            template="""
            <system>
            You are a helpful assistant specializing in data extraction from documents.
            </system>

            <user>
            Your task is to extract all roles mentioned in the given documents and their associated tasks.
            Provide your answer as a JSON string where keys are roles and values are lists of tasks.
            Only return the JSON string without any additional explanation or formatting.

            Example format:
            {{"Role1": ["Task1", "Task2", "Task3"], "Role2": ["Task1", "Task2"]}}

            Ensure your output is a valid JSON string that can be parsed directly.
            </user>

            <query>
            Role in my team: {myteam}
            Extract all roles and their associated tasks from the following document:
            {document_content}
            </query>
            """
        )

        # Create a simple chain for this document
        chain = prompt | llm | output_parser

        # Execute chain with document content
        response = chain.invoke({
            "myteam": ", ".join(current_roles),
            "document_content": doc_content
        })

        try:
            parsed = json.loads(response)
        except json.JSONDecodeError:
            # Models often wrap the JSON in Markdown fences or add a sentence
            # around it despite the instructions. Retry on the outermost
            # {...} span before giving up.
            start, end = response.find("{"), response.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(response[start:end + 1])
    except json.JSONDecodeError as e:
        print(f"Error parsing response for document: {e}")
        return {}
    except Exception as e:
        # Best-effort by design: one bad document must not abort a batch.
        print(f"Error processing document: {e}")
        return {}

    # Valid JSON that is not an object (list, string, number, null) would
    # break the declared return type for callers that expect a mapping.
    if not isinstance(parsed, dict):
        print(
            "Error parsing response for document: "
            f"expected a JSON object, got {type(parsed).__name__}"
        )
        return {}
    return parsed

In [33]:

# Example usage: the PDF(s) to analyse and the roles already on the team.
document_paths = [r"D:\Mindforge\AIService\test_doc\doc1.pdf"]

current_roles = ["Software Developer", "UX Designer", "Project Manager"]

In [34]:
# Run the extraction over every configured path.
# NOTE(review): process_documents is not defined in the cells visible here;
# presumably it lives in another cell or module -- confirm before running.
results = process_documents(document_paths=document_paths, current_roles=current_roles)


Loading documents...
Loaded 27 documents.
Splitting documents into chunks...
Split into 27 chunks.
Error processing document: 'dict' object has no attribute 'replace'
Error processing document: 'dict' object has no attribute 'replace'
Error processing document: 'dict' object has no attribute 'replace'
Error processing document: 'dict' object has no attribute 'replace'
Error processing document: 'dict' object has no attribute 'replace'
Error processing document: 'dict' object has no attribute 'replace'
Error processing document: 'dict' object has no attribute 'replace'
Error processing document: 'dict' object has no attribute 'replace'
Error processing document: 'dict' object has no attribute 'replace'
Error processing document: 'dict' object has no attribute 'replace'
Error processing document: 'dict' object has no attribute 'replace'
Error processing document: 'dict' object has no attribute 'replace'
Error processing document: 'dict' object has no attribute 'replace'
Error processing

In [None]:
# Pretty-print the collected results as indented JSON.
formatted_results = json.dumps(results, indent=2)
print(formatted_results)

# agent_human_management.py