In [None]:
!pip install crewai PyPDF2 openai

In [None]:
!pip install --upgrade PyPDF2 crewai gradio

In [None]:
!pip install ratelimit validate-email-address bleach python-dotenv

In [4]:
import os
import sys
import traceback
from crewai import Agent, Task, Crew, Process
import PyPDF2
import re
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import json
import gradio as gr

def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Passed straight to ``PyPDF2.PdfReader``; anything that
            constructor accepts is valid here.

    Returns:
        The text of all pages joined in page order with no separator.
        A page whose ``extract_text()`` result is falsy contributes an
        empty string.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # ``or ""`` keeps a falsy result (e.g. None) from breaking the join, and
    # ``join`` avoids rebuilding the accumulated string once per page.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def extract_email(text):
    """Return the first email address found in *text*, or ``None``.

    Args:
        text: String to search. ``None`` or an empty string yields ``None``.

    Returns:
        The first substring matching the email pattern, or ``None`` when
        nothing matches.
    """
    if not text:
        return None
    # Inside a character class ``|`` is a literal, not alternation, so the
    # TLD class must be ``[A-Za-z]`` to avoid accepting pipes in the domain.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    # Only the first hit is needed, so stop scanning as soon as it is found.
    match = re.search(email_pattern, text)
    return match.group(0) if match else None

class DocumentAnalyzerCrew:
    """Three-agent CrewAI pipeline: extract fields, compare them, write a report.

    The crew consists of a text-extraction agent, a comparison agent and a
    reporting agent. ``analyze_document`` runs one task per agent, in that
    order, and returns a single formatted string.
    """

    def __init__(self, document_text, reference_data, api_key, max_chars=2000):
        """Create the three agents and remember the inputs.

        Args:
            document_text: Full text of the document to analyse.
            reference_data: Mapping of item name to expected value. Its keys
                are listed in the extraction prompt and the whole mapping is
                embedded as JSON in the comparison prompt.
            api_key: OpenAI API key. When truthy it is exported as the
                ``OPENAI_API_KEY`` environment variable (a process-wide side
                effect); when falsy the environment is left untouched.
            max_chars: Number of leading characters of ``document_text``
                placed in the extraction prompt. ``None`` sends the whole
                text.
        """
        # Only export a key that was actually supplied: assigning None to
        # os.environ raises TypeError, and an empty string would overwrite a
        # key that is already configured in the environment.
        if api_key:
            os.environ["OPENAI_API_KEY"] = api_key

        # Per-task outputs of the most recent analyze_document() run.
        self.agent_results = {}

        self.text_extraction_agent = Agent(
            role="Text Extraction Specialist",
            goal="Extract precise key information from documents",
            backstory="An expert in parsing complex documents and identifying critical information with high accuracy",
            verbose=True,
            allow_delegation=False
        )

        self.comparison_agent = Agent(
            role="Document Comparison Expert",
            goal="Compare extracted document data against reference standards",
            backstory="A meticulous analyst specializing in identifying discrepancies between document contents and expected standards",
            verbose=True,
            allow_delegation=False
        )

        self.reporting_agent = Agent(
            role="Detailed Report Generator",
            goal="Create comprehensive and clear analysis reports",
            backstory="A skilled communicator who transforms technical findings into clear, actionable insights",
            verbose=True,
            allow_delegation=False
        )

        self.document_text = document_text
        self.reference_data = reference_data
        self.max_chars = max_chars

    def _build_tasks(self):
        """Return the (extraction, comparison, reporting) tasks, in run order."""
        extraction_task = Task(
            description=f"""
            Carefully extract key information from the following document text.
            Focus on finding these specific items: {', '.join(self.reference_data)}

            Document Text:
            {self.document_text[:self.max_chars]}
            """,
            agent=self.text_extraction_agent,
            expected_output="A detailed dictionary of extracted key-value pairs from the document"
        )

        # Context is wired explicitly instead of relying on the framework's
        # implicit hand-off between consecutive tasks.
        comparison_task = Task(
            description=f"""
            Compare the extracted data against these reference standards:
            {json.dumps(self.reference_data, indent=2)}

            Identify any mismatches, missing information, or potential discrepancies.
            Provide a detailed explanation for each difference found.
            """,
            agent=self.comparison_agent,
            expected_output="A comprehensive list of mismatches and detailed analysis",
            context=[extraction_task]
        )

        # The report is asked to include both the extracted information and
        # the comparison results, so both earlier tasks are passed as context.
        reporting_task = Task(
            description="""
            Generate a comprehensive report summarizing the document analysis.
            Include:
            1. Extracted key information
            2. Comparison results
            3. Detailed insights and recommendations
            """,
            agent=self.reporting_agent,
            expected_output="A detailed, structured report with all analysis findings",
            context=[extraction_task, comparison_task]
        )

        return extraction_task, comparison_task, reporting_task

    def analyze_document(self):
        """Run the crew and return the combined report as a string.

        Returns:
            A report containing the extraction task output, the comparison
            task output and the crew's final result under separate headings.
            If anything raises while the crew runs, a string starting with
            ``"Analysis Error: "`` followed by the traceback is returned
            instead; this method does not propagate the exception.
        """
        extraction_task, comparison_task, reporting_task = self._build_tasks()

        crew = Crew(
            agents=[
                self.text_extraction_agent,
                self.comparison_agent,
                self.reporting_agent
            ],
            tasks=[
                extraction_task,
                comparison_task,
                reporting_task
            ],
            # Spelled out because each task's context depends on the earlier
            # tasks having already run.
            process=Process.sequential,
            verbose=True  # Enable verbose output
        )

        try:
            result = crew.kickoff()

            extraction_text = str(extraction_task.output)
            comparison_text = str(comparison_task.output)
            final_text = str(result)

            # Keep the individual outputs available to callers that want
            # more than the formatted report.
            self.agent_results = {
                "extraction": extraction_text,
                "comparison": comparison_text,
                "report": final_text,
            }

            sections = [
                "📋 Detailed Document Analysis Report\n\n",
                "🔍 Text Extraction Agent:\n",
                extraction_text + "\n\n",
                "🔬 Document Comparison Agent:\n",
                comparison_text + "\n\n",
                "📝 Final Analysis Report:\n",
                final_text + "\n",
            ]
            return "".join(sections)
        except Exception as e:
            # Boundary handler: the caller displays the returned string, so
            # failures are reported as text rather than raised.
            return f"Analysis Error: {str(e)}\n{traceback.format_exc()}"

def send_email(recipient_email, analysis_result):
    """Send the analysis report as a plain-text email via smtp.gmail.com:465.

    Sender credentials are read from the ``SMTP_EMAIL`` and
    ``SMTP_PASSWORD`` environment variables.

    Args:
        recipient_email: Destination address.
        analysis_result: Text used as the message body.

    Returns:
        A ``(success, message)`` tuple. Missing credentials, a missing
        recipient and any error raised while talking to the SMTP server are
        reported through the tuple rather than raised.
    """
    sender_email = os.getenv("SMTP_EMAIL")
    sender_password = os.getenv("SMTP_PASSWORD")

    if not (sender_email and sender_password):
        return False, "Email credentials not found in environment variables"

    # Fail fast with a clear message instead of letting the SMTP server
    # reject an empty envelope recipient.
    if not recipient_email:
        return False, "No recipient email address provided"

    message = MIMEMultipart()
    message["From"] = sender_email
    message["To"] = recipient_email
    message["Subject"] = "Document Analysis Report"

    message.attach(MIMEText(analysis_result, "plain"))

    try:
        # A timeout keeps a stalled connection from blocking the caller
        # indefinitely.
        with smtplib.SMTP_SSL("smtp.gmail.com", 465, timeout=30) as server:
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, recipient_email, message.as_string())
        return True, "Email sent successfully"
    except Exception as e:
        # Boundary handler: the contract is to return a status tuple.
        return False, f"Failed to send email: {str(e)}"

def process_document(api_key, pdf_file, send_email_flag):
    """Analyse an uploaded PDF and optionally email the report.

    Args:
        api_key: OpenAI API key, forwarded to ``DocumentAnalyzerCrew``.
        pdf_file: The uploaded PDF, forwarded to ``extract_text_from_pdf``.
            ``None`` means nothing was uploaded.
        send_email_flag: When truthy, the report is emailed to the first
            address found in the document text.

    Returns:
        The analysis report, followed by an ``Email Status`` line when
        emailing was requested. Errors are returned as a string (with
        traceback) rather than raised, because the result is shown in the UI.
    """
    # Give a readable message instead of a traceback when the button is
    # pressed without an upload.
    if pdf_file is None:
        return "Error: no PDF document was uploaded."

    # Default reference data
    reference_data = {
        "limitation of liability": "30 days",
        "owner expiry date": "30 days",
        "notice period": "30 days",
        "agreement duration": "12 months",
        "payment terms": "30 days",
        "confidentiality period": "3 years",
        "insurance coverage": "$1,000,000",
        "maximum monthly hours": "120"
    }

    try:
        # Extract text from PDF
        doc_text = extract_text_from_pdf(pdf_file)

        # Extract email if present
        user_email = extract_email(doc_text)

        # Initialize document analyzer
        document_analyzer = DocumentAnalyzerCrew(doc_text, reference_data, api_key)

        # Perform document analysis
        analysis_result = document_analyzer.analyze_document()

        # Optional email sending. Always report an outcome when the user
        # asked for an email, including the case where no address was found.
        email_status = ""
        if send_email_flag:
            if user_email:
                _, message = send_email(user_email, analysis_result)
            else:
                message = "No email address found in the document; report not sent"
            email_status = f"\n\nEmail Status: {message}"

        return analysis_result + email_status

    except Exception as e:
        return f"An error occurred: {str(e)}\n{traceback.format_exc()}"

def launch_gradio_interface():
    """Assemble the Gradio Blocks UI and return it without launching it.

    Layout: a heading, then one row with two columns. The first column holds
    the API-key box, the PDF upload, the email checkbox and the button; the
    second column holds the read-only result box. Clicking the button calls
    ``process_document`` with the three inputs and writes its return value
    to the result box.

    Returns:
        The constructed ``gr.Blocks`` object.
    """
    with gr.Blocks(title="Document Analysis Workflow") as app:
        gr.Markdown("# 📄 Document Analysis Workflow")

        with gr.Row():
            # Inputs and trigger.
            with gr.Column():
                key_box = gr.Textbox(label="OpenAI API Key", type="password", placeholder="Enter your OpenAI API key")
                upload = gr.File(label="Upload PDF Document", file_types=[".pdf"])
                email_toggle = gr.Checkbox(label="Send analysis via email", value=False)
                run_button = gr.Button("Analyze Document", variant="primary")

            # Result display.
            with gr.Column():
                result_box = gr.Textbox(label="Analysis Result", interactive=False)

        # Wire the button to the processing function.
        handler_inputs = [key_box, upload, email_toggle]
        run_button.click(fn=process_document, inputs=handler_inputs, outputs=result_box)

    return app

# Build the interface at module level so `demo` stays available as a module
# attribute, but only start the server when this file is executed directly
# (a notebook cell also runs with __name__ == "__main__").
demo = launch_gradio_interface()
if __name__ == "__main__":
    demo.launch(debug=True)

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


[1m[95m# Agent:[00m [1m[92mText Extraction Specialist[00m
[95m## Task:[00m [92m
            Carefully extract key information from the following document text.
            Focus on finding these specific items: limitation of liability, owner expiry date, notice period, agreement duration, payment terms, confidentiality period, insurance coverage, maximum monthly hours

            Document Text:
            CONSULTING SERVICES AGREEMENT  
 
This Consulting Services Agreement ("Agreement") is made effective as of 15/01/2024, by and between:  
 
ABC Technologies Inc.  
123 Tech Park Avenue  
Silicon Valley, CA 94025  
 
Contact: john.doe@abctech.com  
 
and 
 
XYZ Consulting LLC ("Consultant")  
456 Business Plaza  
New York, NY 10001  
 
KEY TERMS AND CONDITIONS:  
 
1. Agreement Duration:  
   Start Date: 15/01/2024  
   End Date: 14/01/2025  
 
2. Service Scope:  
   - Technical consulting  
   - Project management  
   - Strategic advisory  
 
3. Financial Terms:  
   - Rate

