In [1]:
!pip install crewai -r ../requirements.txt

Collecting crewai
  Downloading crewai-0.70.1-py3-none-any.whl.metadata (19 kB)
Collecting uvicorn==0.30.1 (from -r ../requirements.txt (line 1))
  Downloading uvicorn-0.30.1-py3-none-any.whl.metadata (6.3 kB)
Collecting fastapi==0.110.3 (from -r ../requirements.txt (line 2))
  Downloading fastapi-0.110.3-py3-none-any.whl.metadata (24 kB)
Collecting python-dotenv==1.0.0 (from -r ../requirements.txt (line 3))
  Downloading python_dotenv-1.0.0-py3-none-any.whl.metadata (21 kB)
Collecting crewai
  Downloading crewai-0.51.1-py3-none-any.whl.metadata (14 kB)
Collecting langchain==0.2.15 (from -r ../requirements.txt (line 5))
  Downloading langchain-0.2.15-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-aws==0.1.17 (from -r ../requirements.txt (line 6))
  Downloading langchain_aws-0.1.17-py3-none-any.whl.metadata (3.2 kB)
Collecting sqlalchemy==2.0.31 (from -r ../requirements.txt (line 7))
  Downloading SQLAlchemy-2.0.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [3]:
os.listdir()

['DS01_Test_Web_Crawler_Agent.ipynb',
 'DS01_EDA.ipynb',
 'tools',
 'tasks_dev.yaml',
 'DS01_Policy_PDF_Reader.ipynb',
 'DS01_Test_Agentic_System.ipynb',
 'static',
 '.ipynb_checkpoints',
 'agents_dev.yaml']

In [8]:
import requests
import os


def download_pdfs(country_names, base_url, download_folder):
    # Create the download folder if it doesn't exist
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    for country in country_names:
        pdf_url = base_url.format(country_name=country)
        try:
            pdf_response = requests.get(pdf_url)
            pdf_response.raise_for_status()  # Check if the request was successful
            
            pdf_name = os.path.join(download_folder, f"{country}.pdf")
            with open(pdf_name, 'wb') as pdf_file:
                pdf_file.write(pdf_response.content)
                print(f'Downloaded: {pdf_name}')
        except requests.exceptions.RequestException as e:
            print(f'Failed to download {pdf_url}: {e}')

# List of country names
country_names = [
    "Albania", "Australia", "Austria", "Azerbaijan", "Bahrain", "Belgium-Flemish", "Belgium-French", 
    "Brazil", "Bulgaria", "Canada", "Chinese-Taipei", "Croatia", "Cyprus", "Czech-Republic", "Denmark", "England", 
    "Finland", "France", "Georgia", "Germany", "Hong-Kong-SAR", "Hungary", 
    "Islamic-Republic-of-Iran", "Ireland", "Israel", "Italy", "Jordan", "Kazakhstan", 
    "Latvia", "Lithuania", "Macao-SAR", "Malta", "Montenegro", "Morocco", "Netherlands", 
    "New-Zealand", "North-Macedonia", "Northern-Ireland", "Norway", "Oman", 
    "Poland", "Portugal", "Qatar", "Russian-Federation", 
    "Saudi-Arabia", "Serbia", "Singapore", 
    "Slovak-Republic", "Slovenia", "South-Africa", 
    "Spain", "Sweden", 
    "Turkiye",
    "United-Arab-Emirates",
    "United-States",
    "Uzbekistan"
]

In [9]:
len(country_names)

56

In [11]:
# Base URL with placeholder for country name
base_url = 'https://pirls2021.org/wp-content/uploads/2022/11/{country_name}.pdf'
download_folder = 'policy_documents'

# Download PDFs
download_pdfs(country_names, base_url, download_folder)

Failed to download https://pirls2021.org/wp-content/uploads/2022/11/Albania.pdf: 404 Client Error: Not Found for url: https://pirls2021.org/wp-content/uploads/2022/11/Albania.pdf
Failed to download https://pirls2021.org/wp-content/uploads/2022/11/Australia.pdf: 404 Client Error: Not Found for url: https://pirls2021.org/wp-content/uploads/2022/11/Australia.pdf
Failed to download https://pirls2021.org/wp-content/uploads/2022/11/Austria.pdf: 404 Client Error: Not Found for url: https://pirls2021.org/wp-content/uploads/2022/11/Austria.pdf
Failed to download https://pirls2021.org/wp-content/uploads/2022/11/Azerbaijan.pdf: 404 Client Error: Not Found for url: https://pirls2021.org/wp-content/uploads/2022/11/Azerbaijan.pdf
Failed to download https://pirls2021.org/wp-content/uploads/2022/11/Bahrain.pdf: 404 Client Error: Not Found for url: https://pirls2021.org/wp-content/uploads/2022/11/Bahrain.pdf
Failed to download https://pirls2021.org/wp-content/uploads/2022/11/Belgium-Flemish.pdf: 404 Cl

## Prepare documents

In [13]:
folder_path = 'policy_documents'
pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.pdf')]
# pdf_files

In [14]:
from pypdf import PdfReader
import re

def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

def preprocess_text(text):
    text = re.sub(r'\W+', ' ', text)
    return text.lower()

In [15]:
def segment_text(text):
    paragraphs = text.split('\n\n')
    return [p.strip() for p in paragraphs if p.strip()]

In [27]:
from langchain_aws import BedrockEmbeddings
import numpy as np

MODEL_ID = "anthropic.claude-3-haiku-20240307-v1:0"

# Initialize the BedrockEmbeddings instance
embedder = BedrockEmbeddings(model_id=MODEL_ID)

def get_embeddings(texts):
    embeddings = []
    for text in texts:
        response = embedder.embed_query(text)
        embeddings.append(response['embeddings'])
    return np.array(embeddings)

In [19]:
import hnswlib
import numpy as np

def create_hnswlib_index(embeddings):
    dimension = embeddings.shape
    num_elements = embeddings.shape
    
    # Initialize the index
    index = hnswlib.Index(space='l2', dim=dimension)
    
    # Initialize the index with the number of elements
    index.init_index(max_elements=num_elements, ef_construction=200, M=16)
    
    # Add embeddings to the index
    index.add_items(embeddings)
    
    return index

def query_hnswlib_index(index, query_embedding, k=5):
    labels, distances = index.knn_query(query_embedding, k=k)
    return labels


In [20]:
all_texts = []
for pdf_path in pdf_files:
    text = preprocess_text(extract_text_from_pdf(pdf_path))
    segments = segment_text(text)
    all_texts.extend(segments)

In [45]:
from langchain_community.document_loaders import RecursiveUrlLoader
loader = RecursiveUrlLoader(
    "https://pirls2021.org",
    # max_depth=2,
    # use_async=False,
    # extractor=None,
    # metadata_extractor=None,
    # exclude_dirs=(),
    # timeout=10,
    # check_response_status=True,
    # continue_on_failure=True,
    # prevent_outside=True,
    # base_url="https://pirls2021.org",
    # ...
)



In [46]:
docs = loader.load()

In [48]:
len(docs)

26

In [47]:
docs[23].metadata

{'source': 'https://pirls2021.org/wp-json/wp/v2/pages/3791',
 'content_type': 'application/json; charset=UTF-8'}

In [32]:
print(docs[0].page_content[:300])

<!DOCTYPE html>
<html lang="en-US">
<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
	<link rel="profile" href="http://gmpg.org/xfn/11">
        <link rel="sitemap" type="application/xml" title="Sitemap" href="/sitemap.xml">

	<ti


In [63]:
# from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer

# Load HTML
import requests
from bs4 import BeautifulSoup


def scrape_paragraph_text(url: str) -> list:
    """
    Scrapes text from all <p> elements on a webpage that contain a period.

    Args:
        url (str): The URL of the website to scrape.

    Returns:
        list: A list of text content from <p> elements that contain a period.
    """
    paragraph_texts = []

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    for p in soup.find_all('p'):
        text = p.get_text()
        if '.' in text:
            paragraph_texts.append(text)

    return paragraph_texts[:10]

In [70]:
response = scrape_paragraph_text("https://pirls2021.org/results/context-student/like-reading")

In [86]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def crawl_subpages(url: str) -> list:
    """
    Crawls a website starting from the given URL and returns a list of tuples
    containing subpage URLs and their titles, up to a depth of 1.

    Args:
        url (str): The starting URL of the website to crawl.

    Returns:
        list: A list of tuples where each tuple contains a subpage URL and its title.
    """
    def is_same_base_url(start_url, link):
        return link.startswith(start_url) and link != start_url

    def crawl(url, start_url, visited=None, subpages_list=None):
        if visited is None:
            visited = set()
        if subpages_list is None:
            subpages_list = []
        if url in visited:
            return subpages_list
        visited.add(url)
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        subpages = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
        for subpage in subpages:
            if is_same_base_url(start_url, subpage):
                subpage_response = requests.get(subpage)
                subpage_soup = BeautifulSoup(subpage_response.text, 'html.parser')
                title = subpage_soup.title.string if subpage_soup.title else 'No title'
                subpages_list.append((subpage, title))
        return subpages_list

    start_url = url
    return crawl(start_url, start_url)

start_url = 'https://pirls2021.org/results/achievement'
crawl_subpages(start_url)

[('https://pirls2021.org/results/achievement#content',
  'Countries’ Reading Achievement – PIRLS 2021 – PIRLS 2021'),
 ('https://pirls2021.org/results/achievement/overall',
  'Results – Countries’ Reading Achievement – PIRLS 2021 – PIRLS 2021'),
 ('https://pirls2021.org/results/achievement/',
  'Countries’ Reading Achievement – PIRLS 2021 – PIRLS 2021'),
 ('https://pirls2021.org/results/achievement#impact',
  'Countries’ Reading Achievement – PIRLS 2021 – PIRLS 2021'),
 ('https://pirls2021.org/results/achievement/overall',
  'Results – Countries’ Reading Achievement – PIRLS 2021 – PIRLS 2021'),
 ('https://pirls2021.org/results/achievement/by-gender',
  'Results by Gender – Countries’ Reading Achievement – PIRLS 2021 – PIRLS 2021')]

In [91]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.5


In [103]:
import pandas as pd

df = pd.read_csv("trend_data/pirls_trends.csv", sep=";")
df

Unnamed: 0,Country,2021,2016,2011,2006,2001
0,Australia,540,544.0,527.0,,
1,Austria,530,541.0,529.0,538.0,
2,Azerbaijan,440,472.0,462.0,,
3,Belgium (Flemish),511,525.0,,547.0,
4,Belgium (French),494,497.0,506.0,500.0,
5,Bulgaria,540,552.0,532.0,547.0,550.0
6,Chinese Taipei,544,559.0,553.0,535.0,
7,Cyprus,511,,,,494.0
8,Czech Republic,540,543.0,545.0,,537.0
9,Denmark,539,547.0,554.0,546.0,


In [104]:
json_data = df.to_json(orient='records')

# Convert JSON to string
json_string = str(json_data)

print(json_string)

[{"Country":"Australia","2021":540,"2016":544.0,"2011":527.0,"2006":null,"2001":null},{"Country":"Austria","2021":530,"2016":541.0,"2011":529.0,"2006":538.0,"2001":null},{"Country":"Azerbaijan","2021":440,"2016":472.0,"2011":462.0,"2006":null,"2001":null},{"Country":"Belgium (Flemish)","2021":511,"2016":525.0,"2011":null,"2006":547.0,"2001":null},{"Country":"Belgium (French)","2021":494,"2016":497.0,"2011":506.0,"2006":500.0,"2001":null},{"Country":"Bulgaria","2021":540,"2016":552.0,"2011":532.0,"2006":547.0,"2001":550.0},{"Country":"Chinese Taipei","2021":544,"2016":559.0,"2011":553.0,"2006":535.0,"2001":null},{"Country":"Cyprus","2021":511,"2016":null,"2011":null,"2006":null,"2001":494.0},{"Country":"Czech Republic","2021":540,"2016":543.0,"2011":545.0,"2006":null,"2001":537.0},{"Country":"Denmark","2021":539,"2016":547.0,"2011":554.0,"2006":546.0,"2001":null},{"Country":"Egypt","2021":378,"2016":330.0,"2011":null,"2006":null,"2001":null},{"Country":"England","2021":558,"2016":559.0,