In [8]:
!pip install requests beautifulsoup4 pandas PyPDF2 pytesseract langchain langchain_community crewai

Collecting crewai
  Downloading crewai-0.41.1-py3-none-any.whl.metadata (13 kB)
Collecting appdirs<2.0.0,>=1.4.4 (from crewai)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting embedchain<0.2.0,>=0.1.114 (from crewai)
  Downloading embedchain-0.1.119-py3-none-any.whl.metadata (9.1 kB)
Collecting instructor==1.3.3 (from crewai)
  Downloading instructor-1.3.3-py3-none-any.whl.metadata (13 kB)
Collecting json-repair<0.26.0,>=0.25.2 (from crewai)
  Downloading json_repair-0.25.3-py3-none-any.whl.metadata (7.9 kB)
Collecting jsonref<2.0.0,>=1.1.0 (from crewai)
  Downloading jsonref-1.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting opentelemetry-api<2.0.0,>=1.22.0 (from crewai)
  Downloading opentelemetry_api-1.26.0-py3-none-any.whl.metadata (1.4 kB)
Collecting opentelemetry-exporter-otlp-proto-http<2.0.0,>=1.22.0 (from crewai)
  Downloading opentelemetry_exporter_otlp_proto_http-1.26.0-py3-none-any.whl.metadata (2.3 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.22

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from PyPDF2 import PdfReader
import pytesseract
from PIL import Image
import io
from langchain.llms import OpenAI

# Ensure you have Tesseract OCR installed and its path set correctly
pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/bin/tesseract'


ModuleNotFoundError: No module named 'langchain_community'

In [4]:
# Function to scrape HTML content
def scrape_html(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    projects = []
    # Adjust based on the specific structure of the website
    for project in soup.find_all('div', class_='project-item'):
        title = project.find('h2').text
        description = project.find('p', class_='description').text
        deadline = project.find('span', class_='deadline').text
        projects.append({'Title': title, 'Description': description, 'Deadline': deadline})
    return projects

# Function to extract text from PDF
def extract_pdf_text(url):
    response = requests.get(url)
    pdf = PdfReader(io.BytesIO(response.content))
    text = ""
    for page in pdf.pages:
        text += page.extract_text()
    return text

# Function to extract text from images using OCR
def extract_image_text(url):
    response = requests.get(url)
    img = Image.open(io.BytesIO(response.content))
    text = pytesseract.image_to_string(img)
    return text

# Function to summarize text using LLM
def summarize_text(text):
    prompt = f"Summarize the following text: {text}"
    llm = OpenAI(api_key='your_openai_api_key')
    summary = llm.generate(prompt)
    return summary


# Claude

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from PyPDF2 import PdfReader
import pytesseract
from PIL import Image
import io
from langchain.llms import OpenAI
from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent
from langchain.prompts import StringPromptTemplate
from langchain import LLMChain
from crewai import Agent, Task, Crew

In [10]:
def summarize_text(text):
    llm = OpenAI(api_key='your_openai_api_key')
    chain = LLMChain(llm=llm, prompt=StringPromptTemplate(
        input_variables=["text"],
        template="Summarize the following text: {text}"
    ))
    summary = chain.run(text=text)
    return summary

In [11]:
def create_agentic_workflow(url):
    tools = [
        Tool(name="Scrape HTML", func=scrape_html, description="Scrape projects from HTML"),
        Tool(name="Extract PDF", func=extract_pdf_text, description="Extract text from PDF"),
        Tool(name="Extract Image", func=extract_image_text, description="Extract text from image using OCR"),
        Tool(name="Summarize", func=summarize_text, description="Summarize extracted text")
    ]

    llm = OpenAI(api_key='your_openai_api_key')
    agent = LLMSingleActionAgent(llm_chain=LLMChain(llm=llm, prompt=StringPromptTemplate(
        input_variables=["input", "tools"],
        template="Given the URL {input}, choose the appropriate tool to extract project information: {tools}"
    )), allowed_tools=[tool.name for tool in tools])

    executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=True)
    return executor.run(url)

In [12]:
def create_crew():
    scraper = Agent(
        role="Web Scraper",
        goal="Extract project information from websites",
        backstory="Expert in web scraping and data extraction",
        tools=[scrape_html, extract_pdf_text, extract_image_text]
    )

    analyzer = Agent(
        role="Data Analyzer",
        goal="Analyze and summarize extracted project information",
        backstory="Expert in data analysis and summarization",
        tools=[summarize_text]
    )

    scraping_task = Task(
        description="Scrape project information from the given URL",
        agent=scraper
    )

    analysis_task = Task(
        description="Analyze and summarize the extracted project information",
        agent=analyzer
    )

    crew = Crew(
        agents=[scraper, analyzer],
        tasks=[scraping_task, analysis_task]
    )

    return crew

def process_url_with_crew(url):
    crew = create_crew()
    result = crew.kickoff(url)
    return result

In [13]:
def main():
    print("Project Extraction and Analysis Tool")
    print("====================================")
    
    while True:
        url = input("Enter a URL to process (or 'quit' to exit): ")
        if url.lower() == 'quit':
            break
        
        print("Processing URL:", url)
        print("1. Using Agentic Workflow")
        agentic_result = create_agentic_workflow(url)
        print("Agentic Workflow Result:", agentic_result)
        
        print("\n2. Using CrewAI")
        crew_result = process_url_with_crew(url)
        print("CrewAI Result:", crew_result)
        
        print("\nProcessing complete.")
        print("------------------------------------")

if __name__ == "__main__":
    main()

Project Extraction and Analysis Tool
Processing URL: https://www.gatesfoundation.org/
1. Using Agentic Workflow


  warn_deprecated(


TypeError: Can't instantiate abstract class StringPromptTemplate with abstract method format