In [17]:
# Remove the current user's pip cache directory (~/.cache/pip).
# NOTE(review): this cell carries execution count 17, higher than the install cells
# that follow it — confirm it is really meant to run first on a fresh kernel.
!rm -rf ~/.cache/pip

In [3]:
!pip install setuptools==70.0.0

Collecting setuptools==70.0.0
  Using cached setuptools-70.0.0-py3-none-any.whl.metadata (5.9 kB)
Using cached setuptools-70.0.0-py3-none-any.whl (863 kB)
Installing collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 72.1.0
    Uninstalling setuptools-72.1.0:
      Successfully uninstalled setuptools-72.1.0
Successfully installed setuptools-70.0.0


In [1]:
!pip install crewai -r ../requirements.txt



In [2]:
import dotenv
# load_dotenv() returns a falsy value when it loaded nothing; stop the notebook
# here with an explicit message instead of a bare AssertionError.
assert dotenv.load_dotenv(), (
    "dotenv.load_dotenv() loaded nothing: no .env file was found, "
    "or the file defines no variables"
)

In [3]:
# Import required libraries
import os
from langchain_aws import ChatBedrock

# Bedrock model identifiers (one Claude 3.5 Sonnet, one Claude 3 Haiku, one Llama 3 8B).
# Commented-out alternatives kept for reference:
#   meta.llama3-70b-instruct-v1:0
#   mistral.mistral-7b-instruct-v0:2
#   mistral.mixtral-8x7b-instruct-v0:1
MODEL_ID = "anthropic.claude-3-5-sonnet-20240620-v1:0"  # used by llm and llm4
MODEL_ID2 = "anthropic.claude-3-haiku-20240307-v1:0"  # used by llm2
MODEL_ID3 = "meta.llama3-8b-instruct-v1:0"  # used by llm3

# One ChatBedrock client per model at temperature 0 ...
llm = ChatBedrock(model_id=MODEL_ID, model_kwargs=dict(temperature=0))
llm2 = ChatBedrock(model_id=MODEL_ID2, model_kwargs=dict(temperature=0))
llm3 = ChatBedrock(model_id=MODEL_ID3, model_kwargs=dict(temperature=0))
# ... plus a second client for MODEL_ID at temperature 0.7.
llm4 = ChatBedrock(model_id=MODEL_ID, model_kwargs=dict(temperature=0.7))

In [4]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Settings for the embedding model: BAAI/bge-small-en on CPU, with
# normalize_embeddings switched on for encoding.
model_name = "BAAI/bge-small-en"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)

# Build the embeddings object from the three settings above.
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

  from tqdm.autonotebook import tqdm, trange


In [5]:
# Alias the existing `hf` embeddings object as `hf_embeddings` (no new instance is
# created); the vector-store cells pass `hf_embeddings` to Chroma.
hf_embeddings = hf

In [16]:
from langchain.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

# The PDF is referenced by URL; the same URL doubles as the "source" metadata value.
pdf_url = "https://pirls2021.org/wp-content/uploads/2023/05/P21_MP_Ch3-sample-design.pdf"

# PyPDFLoader receives the URL directly as its file_path.
loader = PyPDFLoader(file_path=pdf_url)
pdf_docs = loader.load()

# Stamp every loaded document with the URL it came from.
for doc in pdf_docs:
    doc.metadata.update(source=pdf_url)

# Chunk the documents (chunk_size=300, chunk_overlap=50, measured by the tiktoken encoder).
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap=50)
splits = text_splitter.split_documents(pdf_docs)

# Embed the chunks with the Hugging Face embeddings and index them in Chroma.
# NOTE(review): the Excel and web-page cells assign the same names (`splits`,
# `vectorstore`, `retriever`), so whichever of these cells ran last wins.
vectorstore = Chroma.from_documents(documents=splits, embedding=hf_embeddings)
retriever = vectorstore.as_retriever()

In [7]:
import nltk
# Print the version of the NLTK package imported in this kernel.
print(nltk.__version__)

3.9.1


In [8]:
import nltk

# Fetch the required NLTK resources into NLTK's default data directory.
# No explicit download_dir is passed: a hard-coded /home/ec2-user/... path ties the
# notebook to one machine, and this cell's own log shows the default directory
# already resolves to that location there.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [9]:
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Location of the Excel workbook — here a remote URL rather than a local path.
excel_path = "https://pirls2021.org/wp-content/uploads/2022/files/1_1-2_achievement-results-1.xlsx"  # PIRLS 2021 achievement-results workbook (per the file name)

# Load the workbook through UnstructuredExcelLoader with mode="elements".
# NOTE(review): a URL is passed as file_path — confirm the loader accepts remote paths in the pinned version.
loader = UnstructuredExcelLoader(file_path=excel_path, mode="elements")
excel_docs = loader.load()

# Function to clean metadata
def clean_metadata(metadata):
    """Flatten list-valued metadata entries into comma-separated strings.

    Every value that is a ``list`` is replaced by its items joined with ", ".
    Items are converted with ``str()`` first, so lists holding non-string
    values (e.g. integers) no longer raise ``TypeError``. All other values
    are left untouched.

    Args:
        metadata: Dict of metadata; it is modified in place.

    Returns:
        The same ``metadata`` dict object, after modification.
    """
    for key, value in metadata.items():
        if isinstance(value, list):
            # Only existing keys are reassigned, so the dict's size does not
            # change while it is being iterated.
            metadata[key] = ', '.join(str(item) for item in value)
    return metadata

# Run clean_metadata over the metadata of every loaded element so that
# list values become plain strings.
for doc in excel_docs:
    doc.metadata = clean_metadata(doc.metadata)

# Chunk the elements (chunk_size=300, chunk_overlap=50, measured by the tiktoken encoder).
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap=50)
splits = text_splitter.split_documents(excel_docs)

# Embed the chunks with the Hugging Face embeddings and index them in Chroma.
# NOTE(review): the PDF and web-page cells assign the same names (`splits`,
# `vectorstore`, `retriever`), so whichever of these cells ran last wins.
vectorstore = Chroma.from_documents(documents=splits, embedding=hf_embeddings)
retriever = vectorstore.as_retriever()

In [11]:
#### INDEXING ####

# Load web page
import bs4
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import WebBaseLoader


# Page to index; the same URL is later written into each document's metadata.
target_url = "https://pirls2021.org/results/international-benchmarks/percentages"

# Restrict BeautifulSoup parsing to <p> elements via a SoupStrainer.
loader = WebBaseLoader(
    web_paths=(target_url,),
    bs_kwargs={"parse_only": bs4.SoupStrainer("p")},
)
blog_docs = loader.load()

# Record the page URL as the "source" of every loaded document.
for doc in blog_docs:
    doc.metadata.update(source=target_url)

# Split (chunk_size=300, chunk_overlap=50, measured by the tiktoken encoder)
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap=50)
splits = text_splitter.split_documents(blog_docs)

# Embed the chunks with the Hugging Face embeddings and index them in Chroma.
# NOTE(review): the PDF and Excel cells assign the same names (`splits`,
# `vectorstore`, `retriever`), so whichever of these cells ran last wins.
vectorstore = Chroma.from_documents(documents=splits, embedding=hf_embeddings)
retriever = vectorstore.as_retriever()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [17]:
from langchain.prompts import ChatPromptTemplate

# Multi Query: Different Perspectives
# The prompt asks the model for three rephrasings of the user question, one per line.
template = """You are an AI language model assistant. Your task is to generate three 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser

# Chain: prompt -> llm -> plain string -> list of query strings.
# Blank or whitespace-only lines in the model output are dropped, so that no
# empty query is handed to whatever consumes this list.
generate_queries = (
    prompt_perspectives
    | llm
    | StrOutputParser()
    | (lambda x: [line for line in x.split("\n") if line.strip()])
)

In [18]:
from pydantic import BaseModel

class Config:
    """Plain settings holder exposing ``arbitrary_types_allowed = True``.

    NOTE(review): this class is defined at top level (not nested in a model)
    and nothing in the visible cells references it — presumably meant as a
    pydantic model config; confirm it is still needed.
    """

    arbitrary_types_allowed = True

In [20]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """Return the unique union of several retrieval results, in first-seen order.

    Each document is serialized with ``dumps`` so it can be compared and
    hashed. Duplicates are removed while keeping the position of the first
    occurrence, and the survivors are deserialized with ``loads``. Keeping
    insertion order (instead of going through a ``set``) makes the result
    reproducible from run to run.

    Args:
        documents: A list of lists of documents, one inner list per query.

    Returns:
        A flat list of unique documents; empty if there were none.
    """
    # Flatten the list of lists, serializing each document on the way.
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict keys are unique and keep insertion order, giving an ordered de-duplication.
    unique_serialized = dict.fromkeys(serialized)
    return [loads(item) for item in unique_serialized]

# Retrieve
# The question asks *how* the scores are weighted ("How", not "Who").
question = "How are reading scores weighted in PIRLS 2021?"

# generate_queries yields a list of queries; retriever.map() runs the retriever on
# each of them, and get_unique_union merges the per-query results.
# NOTE(review): `retriever` is assigned by several indexing cells — confirm which
# one is expected to be active when this cell runs.
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question": question})
len(docs)

14

In [23]:
from operator import itemgetter
# from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG
# Answer prompt: receives the retrieved documents as {context} and the user
# question as {question}, and spells out the expected markdown layout.
template = """Answer the following question based on this context:

{context}

Question: {question}

The answer to the user's question in markdown format. 
Each paragraph starts with an emoji and a heading in capital letters. 
Use bullet points (starting with emoji) and tables to increase readability. 
Cite explicitly and cite your sources with a link. The hyperlink should be directly added to the quote and a footnote added at the end. 
Also refer to sources (links) that are mentioned in relation to your citations.
"""
prompt = ChatPromptTemplate.from_template(template)

# "context" comes from running retrieval_chain on the input; "question" is
# picked out of the input dict unchanged.
final_rag_chain = (
    dict(context=retrieval_chain, question=itemgetter("question"))
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question})

'📊 SAMPLING WEIGHTS IN PIRLS 2021\n\n🔢 OVERVIEW OF WEIGHTING PROCESS\n- Student reading scores in PIRLS 2021 are weighted using a complex sampling design to ensure representative results\n- Weights account for selection probabilities and non-response at multiple levels:\n  • School level\n  • Class level \n  • Student level\n\n🏫 SCHOOL WEIGHTING\n- Schools are sampled with probability proportional to size\n- Basic school weight is the inverse of selection probability\n- Adjusted for non-participating schools\n\n📚 CLASS WEIGHTING  \n- Classes within sampled schools are randomly selected\n- Class weights account for number of classes selected per school\n- Adjusted for non-participating classes\n\n👨\u200d🎓 STUDENT WEIGHTING\n- All students in selected classes are included\n- Student weights account for student non-response within classes\n\n🧮 FINAL WEIGHT CALCULATION\n- Overall student weight is the product of school, class, and student weight components\n- As stated in the source: "The 