# Knowledge Sources in CrewAI

## String knowledge source

In [1]:
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource

# Define the knowledge
policy_text = """Our return policy allows customers to return any product within 30 days of purchase.
                 Refunds will be issued only if the item is unused and in original packaging.
                 Customers must provide proof of purchase when requesting a return."""

# Create a StringKnowledgeSource object
return_policy_knowledge = StringKnowledgeSource(content=policy_text)


In [2]:
from crewai import LLM

llm = LLM(model="gpt-4o")


In [3]:
from crewai import Agent

returns_agent = Agent(
    role="Product Returns Assistant",
    goal="Answer customer questions about return policy accurately.",
    backstory="You work in customer service and specialize in returns, refunds, and policies.",
    allow_delegation=False,
    verbose=True,
    llm=llm
)


In [4]:
from crewai import Task

returns_task = Task(
    description="Answer the following customer question about returns: {question}",
    expected_output="A concise and accurate answer.",
    agent=returns_agent
)


In [5]:
from crewai import Crew, Process

crew = Crew(
    agents=[returns_agent],
    tasks=[returns_task],
    process=Process.sequential,
    knowledge_sources=[return_policy_knowledge],  # This is key
    verbose=True
)


In [6]:
result = crew.kickoff(inputs={
    "question": "Can I get a refund if I used the item once?"
})

from pprint import pprint
pprint(result.raw)

Output()

Output()

('Our return policy states that refunds will be issued only if the item is '
 'unused and in its original packaging. If you have used the item, '
 'unfortunately, you would not be eligible for a refund. Additionally, any '
 'return must be initiated within 30 days of purchase, and proof of purchase '
 'is required when requesting a return.')


## Web Knowledge Source

In [1]:
from crewai import LLM, Agent, Crew, Process, Task
from crewai.knowledge.source.crew_docling_source import CrewDoclingSource

# Create a knowledge source from web content
content_source = CrewDoclingSource(
    file_paths=[
        "https://lilianweng.github.io/posts/2024-11-28-reward-hacking",
        "https://lilianweng.github.io/posts/2024-07-07-hallucination",
    ],
)

# Create an LLM with a temperature of 0 to ensure deterministic outputs
llm = LLM(model="gpt-4o-mini", temperature=0)

# Create an agent with the knowledge store
agent = Agent(
    role="About papers",
    goal="You know everything about the papers.",
    backstory="You are a master at understanding papers and their content.",
    verbose=True,
    allow_delegation=False,
    llm=llm,
)

task = Task(
    description="Answer the following questions about the papers: {question}",
    expected_output="An answer to the question.",
    agent=agent,
)

crew = Crew(
    agents=[agent],
    tasks=[task],
    verbose=True,
    process=Process.sequential,
    knowledge_sources=[content_source],
)

result = crew.kickoff(
    inputs={"question": "What is the reward hacking paper about? Be sure to provide sources."}
)

2025-10-05 09:14:49,036 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-10-05 09:14:49,049 - INFO - Going to convert document batch...
2025-10-05 09:14:49,049 - INFO - Initializing pipeline for SimplePipeline with options hash 995a146ad601044538e6a923bea22f4e
2025-10-05 09:14:49,071 - INFO - Loading plugin 'docling_defaults'
2025-10-05 09:14:49,075 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-10-05 09:14:49,076 - INFO - Processing document 2024-11-28-reward-hacking
2025-10-05 09:14:49,112 - INFO - Finished converting document 2024-11-28-reward-hacking in 0.85 sec.
2025-10-05 09:14:49,896 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-10-05 09:14:49,907 - INFO - Going to convert document batch...
2025-10-05 09:14:49,908 - INFO - Processing document 2024-07-07-hallucination
2025-10-05 09:14:49,941 - INFO - Finished converting document 2024-07-07-hallucination in 0.83 sec.
2025-10-05 09:14:50,060 - INFO - Anonymized telemetry enabled. See       

[96m
[2025-10-05 09:14:50][0m[93m[ERROR]: [0m[91mFailed to upsert documents: Expected IDs to be unique, found duplicates of: 5ba2636c643741f7d3d6e72d13dea467288cdbb2c0cadb5e38ab6497f862e46b, bc0b8a78a986e022b3b5926820a0b0bd8b1981cc254149c8e87db987329ad412, 48888467a3f93ba7fbdd61dffd02296b539ec4189453710202ed721abe045010 in upsert.[0m
[96m


Output()

[92m09:14:50 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= gpt-4o-mini; provider = openai
2025-10-05 09:14:50,152 - INFO - 
LiteLLM completion() model= gpt-4o-mini; provider = openai


2025-10-05 09:14:51,604 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m09:14:51 - LiteLLM:INFO[0m: utils.py:1260 - Wrapper: Completed Call, calling success_handler
2025-10-05 09:14:51,612 - INFO - Wrapper: Completed Call, calling success_handler


2025-10-05 09:14:52,688 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[92m09:14:52 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= gpt-4o-mini; provider = openai
2025-10-05 09:14:52,712 - INFO - 
LiteLLM completion() model= gpt-4o-mini; provider = openai


Output()

2025-10-05 09:14:56,230 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m09:14:56 - LiteLLM:INFO[0m: utils.py:1260 - Wrapper: Completed Call, calling success_handler
2025-10-05 09:14:56,237 - INFO - Wrapper: Completed Call, calling success_handler


## Text Knowledge Source

In [11]:
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource

text_source = TextFileKnowledgeSource(
    file_paths=["hr_policy.txt"]
)

In [12]:
from crewai import Agent, Task, Crew, Process, LLM

llm = LLM(model="gpt-4o")

hr_agent = Agent(
    role="HR Policy Assistant",
    goal="Answer employee questions about HR policies.",
    backstory="You're a reliable HR knowledge assistant.",
    knowledge_sources=[text_source],
    llm=llm
)

task = Task(
    description="What is the leave policy for new employees?",
    expected_output="A clear summary of the leave policy.",
    agent=hr_agent
)


In [13]:
#import pprint

crew = Crew(
    agents=[hr_agent],
    tasks=[task],
    process=Process.sequential,
    verbose=True
)
result = crew.kickoff()

print(result.raw)


2025-10-05 09:19:21,070 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Output()

[92m09:19:21 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= gpt-4o; provider = openai
2025-10-05 09:19:21,080 - INFO - 
LiteLLM completion() model= gpt-4o; provider = openai


2025-10-05 09:19:21,909 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m09:19:21 - LiteLLM:INFO[0m: utils.py:1260 - Wrapper: Completed Call, calling success_handler
2025-10-05 09:19:21,917 - INFO - Wrapper: Completed Call, calling success_handler


2025-10-05 09:19:22,701 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[92m09:19:22 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= gpt-4o; provider = openai
2025-10-05 09:19:22,712 - INFO - 
LiteLLM completion() model= gpt-4o; provider = openai


Output()

2025-10-05 09:19:25,303 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m09:19:25 - LiteLLM:INFO[0m: utils.py:1260 - Wrapper: Completed Call, calling success_handler
2025-10-05 09:19:25,310 - INFO - Wrapper: Completed Call, calling success_handler


For new employees in our company, the leave policy is as follows: All full-time employees, including new hires, are entitled to 21 days of paid leave annually. However, new employees become eligible to take this leave only after completing their first 30 days of employment. Sick leave must be notified by 10 AM on the same day it is required. Additionally, remote work is permitted for up to 3 days a week, provided you have manager approval. If you need to take more than 3 consecutive days of leave, you must provide supporting documentation.


## PDF source

In [18]:
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource

pdf_source = PDFKnowledgeSource(
    file_paths=["meeting_notes.pdf"]
)

In [19]:
meeting_summarizer = Agent(
    role="Meeting Note Summarizer",
    goal="Provide concise summaries of weekly meetings.",
    backstory="You help the team stay updated on discussions.",
    knowledge_sources=[pdf_source],
    llm=llm
)

task = Task(
    description="Summarize the key action items from last week's meeting.",
    expected_output="A bullet-point list of action items.",
    agent=meeting_summarizer
)


In [20]:

crew = Crew(
    agents=[meeting_summarizer],
    tasks=[task],
    process=Process.sequential,
    verbose=True
)

result = crew.kickoff()
print(result.raw)


2025-10-05 09:22:28,614 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Output()

[92m09:22:28 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= gpt-4o; provider = openai
2025-10-05 09:22:28,627 - INFO - 
LiteLLM completion() model= gpt-4o; provider = openai


2025-10-05 09:22:29,509 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m09:22:29 - LiteLLM:INFO[0m: utils.py:1260 - Wrapper: Completed Call, calling success_handler
2025-10-05 09:22:29,516 - INFO - Wrapper: Completed Call, calling success_handler


2025-10-05 09:22:30,603 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[92m09:22:30 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= gpt-4o; provider = openai
2025-10-05 09:22:30,619 - INFO - 
LiteLLM completion() model= gpt-4o; provider = openai


Output()

2025-10-05 09:22:32,001 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m09:22:32 - LiteLLM:INFO[0m: utils.py:1260 - Wrapper: Completed Call, calling success_handler
2025-10-05 09:22:32,003 - INFO - Wrapper: Completed Call, calling success_handler


- Product: Finalize Q2 roadmap (due April 20)
- DS: Conduct feasibility study on CrewAI (due April 18)
- Eng: Migrate ML serving infra to Kubernetes (due May 31)


## CSV source

In [21]:
from crewai.knowledge.source.csv_knowledge_source import CSVKnowledgeSource

csv_source = CSVKnowledgeSource(
    file_paths=["feedback.csv"]
)

In [23]:
feedback_analyst = Agent(
    role="User Feedback Analyst",
    goal="Identify common themes in user feedback.",
    backstory="You specialize in converting raw feedback into insights.",
    knowledge_sources=[csv_source],
    llm=llm
)

task = Task(
    description="What are the three most common complaints users had last month?",
    expected_output="A short list of recurring issues.",
    agent=feedback_analyst
)

In [24]:
crew = Crew(
    agents=[feedback_analyst],
    tasks=[task],
    process=Process.sequential,
    verbose=True
)

result = crew.kickoff()
print(result.raw)


2025-10-05 09:24:45,206 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Output()

[92m09:24:45 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= gpt-4o; provider = openai
2025-10-05 09:24:45,219 - INFO - 
LiteLLM completion() model= gpt-4o; provider = openai


2025-10-05 09:24:46,804 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m09:24:46 - LiteLLM:INFO[0m: utils.py:1260 - Wrapper: Completed Call, calling success_handler
2025-10-05 09:24:46,809 - INFO - Wrapper: Completed Call, calling success_handler


2025-10-05 09:24:47,204 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[92m09:24:47 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= gpt-4o; provider = openai
2025-10-05 09:24:47,218 - INFO - 
LiteLLM completion() model= gpt-4o; provider = openai


Output()

2025-10-05 09:24:49,307 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m09:24:49 - LiteLLM:INFO[0m: utils.py:1260 - Wrapper: Completed Call, calling success_handler
2025-10-05 09:24:49,308 - INFO - Wrapper: Completed Call, calling success_handler


The three most common complaints users had last month are:
1. The UI feels slow on mobile.
2. Notifications are too frequent and not customizable.
3. It’s hard to find the export option in reports.


## JSON source

In [25]:
from crewai.knowledge.source.json_knowledge_source import JSONKnowledgeSource

json_source = JSONKnowledgeSource(
    file_paths=["company_info.json"]
)

In [26]:
company_expert = Agent(
    role="Company Info Specialist",
    goal="Answer questions about company structure and data.",
    backstory="You are an internal data assistant for org-level queries.",
    # knowledge_sources=[json_source],
    llm=llm
)

task = Task(
    description="How many teams are working on the product and what are their names?",
    expected_output="A list of team names and their sizes.",
    agent=company_expert
)


In [27]:
crew = Crew(
    agents=[company_expert],
    tasks=[task],
    process=Process.sequential,
    verbose=True,
    knowledge_sources=[json_source]
)

result = crew.kickoff()
print(result)


2025-10-05 09:25:24,257 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Output()

[92m09:25:24 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= gpt-4o; provider = openai
2025-10-05 09:25:24,280 - INFO - 
LiteLLM completion() model= gpt-4o; provider = openai


2025-10-05 09:25:26,197 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m09:25:26 - LiteLLM:INFO[0m: utils.py:1260 - Wrapper: Completed Call, calling success_handler
2025-10-05 09:25:26,207 - INFO - Wrapper: Completed Call, calling success_handler


2025-10-05 09:25:26,844 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[92m09:25:26 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= gpt-4o; provider = openai
2025-10-05 09:25:26,855 - INFO - 
LiteLLM completion() model= gpt-4o; provider = openai


Output()

2025-10-05 09:25:28,576 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m09:25:28 - LiteLLM:INFO[0m: utils.py:1260 - Wrapper: Completed Call, calling success_handler
2025-10-05 09:25:28,583 - INFO - Wrapper: Completed Call, calling success_handler


There are two teams working on the product at TechNova Inc. The team names and their sizes are as follows: 
1. UX Team - 5 members
2. Analytics Team - 3 members


## Custom embedding model

In [None]:
# ollama_embedder = {
#     "provider": "ollama",
#     "config": {
#         "model": "nomic-embed-text",  # Must match or be compatible with Ollama's supported embedding models
#         "api_url": "http://localhost:11434"
#     }
# }

In [1]:
from crewai import Agent, LLM
from dotenv import load_dotenv
import os
load_dotenv()  # Load environment variables from .env file
#os.environ["OPENAI_API_KEY"] = os.getenv("GEMINI_API_KEY")


llm = LLM(model="gemini/gemini-2.5-flash", verbose=True, temperature=0.5,
          api_key=os.getenv("GEMINI_API_KEY"))

In [None]:
# openrouter_llm = LLM(
#     model='openrouter/qwen/qwq-32b:free',
#     api_key=os.environ["OPENROUTER_API_KEY"],
#     temperature=0
# )

In [2]:
ollama_embedder={
        "provider": "google-generativeai",
        "config": {
            "api_key": os.getenv("GEMINI_API_KEY"),
            "model" : "gemini-embedding-001"
            #"model": "models/embedding-001"
        }
    }

In [3]:
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource

# Internal onboarding FAQ
faq_content = """
- You can access your email via portal.company.com using your employee credentials.
- The standard work hours are from 9am to 5pm, Monday to Friday.
- All reimbursement requests must be submitted by the 5th of the following month.
- For any IT-related issues, contact support@company.com.
"""

# Create a string knowledge source
faq_knowledge = StringKnowledgeSource(content=faq_content, embedder=ollama_embedder)


In [4]:
from crewai import Agent

hr_faq_agent = Agent(
    role="HR Assistant",
    goal="Answer onboarding-related questions for new hires.",
    backstory="You are a helpful assistant who knows everything about internal policies and onboarding processes.",
    allow_delegation=False,
    knowledge_sources=[faq_knowledge],
    verbose=True,
    embedder=ollama_embedder
)


In [5]:
from crewai import Task

task = Task(
    description="Answer this onboarding question: {question}",
    expected_output="A short, accurate answer based on internal HR documentation.",
    agent=hr_faq_agent,
    embedder=ollama_embedder
)


In [6]:
from crewai import Crew, Process

crew = Crew(
    agents=[hr_faq_agent],
    tasks=[task],
    #knowledge_sources=[faq_knowledge],
    embedder=ollama_embedder,
    process=Process.sequential,
    verbose=True
)

result = crew.kickoff(inputs={"question": "What are the working hours and how do I get reimbursed?"})

from pprint import pprint
pprint(result.raw)


E0000 00:00:1759637746.527413 1992954 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Output()

Output()

('The standard working hours are from 9 AM to 5 PM, Monday to Friday. To get '
 'reimbursed for any expenses, you need to submit your reimbursement requests '
 'by the 5th of the following month. If you have any IT-related issues while '
 'submitting your reimbursement, please contact support at '
 'support@company.com. You can access your email through portal.company.com '
 'using your employee credentials for any further assistance or to check '
 'additional details.')


# Custom knowledge source

In [7]:
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from typing import Dict, Any
from pydantic import Field
import requests

class WeatherKnowledgeSource(BaseKnowledgeSource):
    """Knowledge source that fetches weather data from an external API."""

    city: str = Field(description="City for which weather should be fetched")

    def load_content(self) -> Dict[Any, str]:
        try:
            print(f"Fetching weather for {self.city}...")

            # Open-Meteo API (no key needed for basic data)
            endpoint = "https://api.open-meteo.com/v1/forecast"
            params = {
                "latitude": 37.77,  # San Francisco by default
                "longitude": -122.42,
                "current_weather": True
            }

            response = requests.get(endpoint, params=params)
            response.raise_for_status()

            weather_data = response.json().get("current_weather", {})
            formatted = self.validate_content(weather_data)
            return {self.city: formatted}

        except Exception as e:
            raise ValueError(f"Failed to fetch weather data: {str(e)}")

    def validate_content(self, data: dict) -> str:
        if not data:
            return "No weather data available."

        return (
            f"Current weather in {self.city}:\n"
            f"- Temperature: {data.get('temperature')}°C\n"
            f"- Wind Speed: {data.get('windspeed')} km/h\n"
            f"- Weather Code: {data.get('weathercode')}\n"
            f"- Time: {data.get('time')}"
        )

    def add(self) -> None:
        """Process and chunk the content."""
        content = self.load_content()
        for _, text in content.items():
            chunks = self._chunk_text(text)
            self.chunks.extend(chunks)
        self._save_documents()


In [8]:
from crewai import Agent, LLM

weather_knowledge = WeatherKnowledgeSource(city="San Francisco")

weather_agent = Agent(
    role="Weather Reporter",
    goal="Answer questions about the current weather forecast.",
    backstory="You are a friendly meteorologist who provides real-time weather updates.",
    knowledge_sources=[weather_knowledge],
    llm=LLM(model="gpt-4o", temperature=0.0),
    verbose=True
)


In [9]:
from crewai import Task, Crew, Process

task = Task(
    description="What is the current temperature and wind speed in San Francisco?",
    expected_output="A concise weather summary for San Francisco.",
    agent=weather_agent
)

crew = Crew(
    agents=[weather_agent],
    tasks=[task],
    process=Process.sequential,
    verbose=True
)


In [10]:
result = crew.kickoff()
print(result)


Fetching weather for San Francisco...


Output()

Output()

The current weather in San Francisco as of 2025-10-05T04:15 is a temperature of 14.7°C with a wind speed of 11.4 km/h.
