In [1]:
!pip install crewai -r ../requirements.txt

Collecting crewai
  Downloading crewai-0.67.1-py3-none-any.whl.metadata (15 kB)
Collecting uvicorn==0.30.1 (from -r ../requirements.txt (line 1))
  Downloading uvicorn-0.30.1-py3-none-any.whl.metadata (6.3 kB)
Collecting fastapi==0.110.3 (from -r ../requirements.txt (line 2))
  Downloading fastapi-0.110.3-py3-none-any.whl.metadata (24 kB)
Collecting python-dotenv==1.0.0 (from -r ../requirements.txt (line 3))
  Downloading python_dotenv-1.0.0-py3-none-any.whl.metadata (21 kB)
Collecting crewai
  Downloading crewai-0.51.1-py3-none-any.whl.metadata (14 kB)
Collecting langchain==0.2.15 (from -r ../requirements.txt (line 5))
  Downloading langchain-0.2.15-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-aws==0.1.17 (from -r ../requirements.txt (line 6))
  Downloading langchain_aws-0.1.17-py3-none-any.whl.metadata (3.2 kB)
Collecting sqlalchemy==2.0.31 (from -r ../requirements.txt (line 7))
  Downloading SQLAlchemy-2.0.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [1]:
import dotenv
assert dotenv.load_dotenv()

In [2]:
# Import required libraries
import os
from langchain_aws import ChatBedrock

# Set up the model ID for Claude
#MODEL_ID = "meta.llama3-8b-instruct-v1:0"
#MODEL_ID = "meta.llama3-70b-instruct-v1:0"
#MODEL_ID = "mistral.mistral-7b-instruct-v0:2"
#MODEL_ID = "mistral.mixtral-8x7b-instruct-v0:1"
MODEL_ID2 = "anthropic.claude-3-haiku-20240307-v1:0"
MODEL_ID = "anthropic.claude-3-5-sonnet-20240620-v1:0"

# Initialize the ChatBedrock instance
llm = ChatBedrock(model_id=MODEL_ID, model_kwargs={'temperature': 0})
llm2 = ChatBedrock(model_id=MODEL_ID2, model_kwargs={'temperature': 0})

In [3]:
message = [
    ("system", "You are a helpful assistant that provides concise information on PIRLS 2021 results."),
    ("human", "What impact did COVID-19 have on reading abilities?'")
]

response = llm.invoke(message)
print(response)

content="Based on the PIRLS 2021 results, COVID-19 generally had a negative impact on reading abilities:\n\n1. Overall decline: Many countries saw a decline in average reading scores compared to previous PIRLS cycles.\n\n2. Varied impact: The extent of the impact varied across countries, with some experiencing more significant declines than others.\n\n3. Learning loss: School closures and disruptions to education likely contributed to learning losses in reading skills.\n\n4. Widened gaps: Existing achievement gaps between advantaged and disadvantaged students may have widened in some cases.\n\n5. Digital divide: Differences in access to technology and online learning resources during lockdowns potentially exacerbated inequalities.\n\nHowever, it's important to note that not all countries experienced declines, and the full long-term impact of COVID-19 on reading abilities is still being studied." additional_kwargs={'usage': {'prompt_tokens': 40, 'completion_tokens': 190, 'total_tokens':

In [4]:
# Imports
import os
from crewai import Agent, Crew, Process, Task
from crewai.project import agent, crew, task
from langchain_aws import ChatBedrock
from langchain_core.tools import tool

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [5]:
os.listdir()

['DS01_Test_Web_Crawler_Agent.ipynb',
 'DS01_EDA.ipynb',
 'tools',
 'documents',
 'tasks_dev.yaml',
 'DS01_Test_Agentic_System.ipynb',
 'static',
 '.ipynb_checkpoints',
 'agents_dev.yaml']

In [6]:
# !pip install 'crewai[tools]'

In [13]:
import tools.database as db_tools
import tools.web_crawl as web_tools

In [8]:
from crewai import Agent, Crew, Process, Task
from crewai.project import CrewBase, agent, crew, task

In [9]:
from pathlib import Path

In [10]:
PROJECT_ROOT = Path('.').resolve()
PROJECT_ROOT

PosixPath('/home/ec2-user/SageMaker/GDSC/dev')

In [15]:
@CrewBase
class DevPIRLSCrew:
    """Data Analysis Crew for the GDSC project."""
    # Load the files from the config directory
    agents_config = PROJECT_ROOT / 'agents_dev.yaml'
    tasks_config = PROJECT_ROOT / 'tasks_dev.yaml'

    def __init__(self, llm):
        self.llm = llm

    def run(self, prompt: str) -> str:
        return self.crew().kickoff(inputs={'user_question': prompt}).raw

    @agent
    def lead_data_analyst(self) -> Agent:
        a = Agent(
            config=self.agents_config['lead_data_analyst'],
            llm=self.llm,
            allow_delegation=False,
            verbose=True,
            tools=[
                web_tools.scrape_paragraph_text
            ]
        )
        return a

    @agent
    def data_engineer(self) -> Agent:
        a = Agent(
            config=self.agents_config['data_engineer'],
            llm=self.llm,
            allow_delegation=False,
            verbose=True,
            tools=[
                db_tools.query_database,
                db_tools.get_possible_answers_to_question,
                db_tools.get_questions_of_given_type
            ]
        )
        return a
    
    # @agent
    # def web_researcher(self) -> Agent:
    #     a = Agent(
    #         config=self.agents_config['web_researcher'],
    #         llm=llm2,
    #         allow_delegation=False,
    #         verbose=True,
    #         tools=[
    #             web_tools.find_relevant_links
    #         ]
    #     )
    #     return a
    
    # @agent
    # def report_compiler(self) -> Agent:
    #     a = Agent(
    #         config=self.agents_config['report_compiler'],
    #         llm=self.llm,
    #         allow_delegation=False,
    #         verbose=True
    #     )
    #     return a

    @task
    def answer_question_task(self) -> Task:
        t = Task(
            config=self.tasks_config['answer_question_task'],
            agent=self.data_engineer()
        )
        return t
    
    # @task
    # def find_relevant_links_task(self) -> Task:
    #     t = Task(
    #         config=self.tasks_config['find_relevant_links_task'],
    #         agent=self.web_researcher()
    #     )
    #     return t
    
    @task
    def analyze_findings_task(self) -> Task:
        t = Task(
            config=self.tasks_config['analyze_findings_task'],
            agent=self.lead_data_analyst()
        )
        return t

    @crew
    def crew(self) -> Crew:
        """Creates the data analyst crew"""
        return Crew(
            agents=self.agents,
            tasks=self.tasks,
            process=Process.sequential,
            verbose=True,
            max_iter=5,
            cache=True
        )

In [18]:
pythonCrew = DevPIRLSCrew(llm=llm)

print(pythonCrew.run("Give me a chocolate cake recipe.")) # According to the PIRLS 2021 study, how did the COVID pandemic impact students' reading habits? , Are boys or girls most lagging behind in reading abilities?

[1m[95m [2024-10-09 22:16:18][DEBUG]: == Working Agent: data engineer
[00m
[1m[95m [2024-10-09 22:16:18][INFO]: == Starting Task: Answer the following question:     Give me a chocolate cake recipe.
When applicable, search for relevant data in the PIRLS 2021 dataset.
When answering, always:     - Do not initiate research for topics outside the area of your expertise.      - Ensure that your dataset queries are accurate and relevant to the research questions. - Unless instructed otherwise, explain how you come to your conclusions and provide evidence to support your claims with specific data. - Prioritize specific findings including numbers and percentages in line with best practices in statistics - Data and numbers should be provided in tables to increase readability. - Try to go the extra mile for open questions (e.g. correlate data with socioeconomic status, compare across countries within a region, integrate suggestions that you have into your query)
[00m


[1m> Entering new C

In [11]:
# from crewai import Agent, tool
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

In [61]:
def read_text_from_website(url: str, query: str) -> list:
    """
    Reads all text from the given URL and its subpages, filtering out non-text content,
    and finds subpages relevant to the user query.

    Args:
        url (str): The URL of the website to read text from.
        query (str): The user query to search for relevant subpages.

    Returns:
        list: A list of tuples containing the text of relevant subpages and their URLs.
    """
    relevant_subpages = []
    
    def extract_text(soup: BeautifulSoup) -> str:
        """
        Extracts readable text from a BeautifulSoup object, filtering out non-text content.

        Args:
            soup (BeautifulSoup): The BeautifulSoup object to extract text from.

        Returns:
            str: The extracted readable text.
        """
        text_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'])
        text = ' '.join(element.get_text() for element in text_elements)
        return text

    def find_relevant_subpages(text: str, url: str) -> None:
        """
        Finds subpages relevant to the user query and stores them with their URLs.

        Args:
            text (str): The text to search for relevant subpages.
            url (str): The URL of the webpage where the text was found.
        """
        if query.lower() in text.lower():
            relevant_subpages.append((text, url))

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    main_text = extract_text(soup)
    find_relevant_subpages(main_text, url)
    
    # Find all links to subpages and read their text
    for link in soup.find_all('a', href=True):
        subpage_url = urljoin(url, link['href'])
        subpage_response = requests.get(subpage_url)
        subpage_soup = BeautifulSoup(subpage_response.content, 'html.parser')
        subpage_text = extract_text(subpage_soup)
        find_relevant_subpages(subpage_text, subpage_url)
    
    return relevant_subpages

# Example usage
# url = "https://example.com"
# query = "specific topic"
# results = read_text_from_website(url, query)
# for text, link in results:
#     print(f"Found subpage: {link}\nText: {text[:200]}...\n")  # Print first 200 characters for brevity

In [22]:
# Example usage
url = "https://pirls2021.org/results"
query = "COVID-19"
results = read_text_from_website(url, query)
for text, link in results:
    print(f"Found subpage: {link}\nText: {text}...\n")  # Print first 200 characters for brevity



Found subpage: https://pirls2021.org/results
Text: 
		Main Navigation	 Home About

About the Center
Staff
Graduate Students
News
Contact

 About the Center Staff Graduate Students News Contact TIMSS

2023
2019
2015
2011
2007
2003
1999
1995
ADVANCED
2015
2008
1995

 2023 2019 2015 2011 2007 2003 1999 1995 ADVANCED 2015 2008 1995 PIRLS

2026
2021
2016
2011
2006
2001

 2026 2021 2016 2011 2006 2001 Other Projects

LaNA Special Administration 2023
Rosetta Stone 2019 – PASEC
Rosetta Stone 2019 – ERCE

 LaNA Special Administration 2023 Rosetta Stone 2019 – PASEC Rosetta Stone 2019 – ERCE Publications Databases IEA 

 PIRLS 2021 Home Assessment Frameworks Encyclopedia Introduction Curriculum Questionnaire Exhibits Country Chapters Download Center PIRLS 2021 International Results
 Methods & Procedures Context Questionnaires International Database PIRLS 2021 International Results in Reading – About PIRLS 2021 PIRLS 2021 International Results in Reading “PIRLS 2021 provides the only internationa