In [1]:
## fitz gave me an error
#!pip install --force-reinstall PyMuPDF

In [2]:
import requests
import fitz  # PyMuPDF
from io import BytesIO
import os
from typing import List
from dotenv import load_dotenv

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnableLambda
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import Chroma

import chromadb
from chromadb.utils import embedding_functions

import openai


In [3]:
###Init clients
"""
Initialize OpenAI API and Langsmith
"""
load_dotenv()
openai.api_key = os.environ["OPENAI_API_KEY"]

print(f"OpenAI API key initialized: {openai.api_key[0:10]}...")

# Create OpenAI embedding function
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    openai.api_key,
    model_name="text-embedding-3-small"
)

OpenAI API key initialized: sk-proj-FP...


### Load and split document
#### I'll use product manual about Samsung s25

In [4]:
##Modulo 1
def read_pdf_from_url(url):
    """
    Read PDF from URL and extract raw text using PyMuPDF.
    
    Args:
        url (str): URL of the PDF file
        
    Returns:
        str: Extracted text from the PDF
    """
    try:
        # Download PDF from URL
        response = requests.get(url)
        response.raise_for_status()
        
        # Open PDF from bytes
        pdf_bytes = BytesIO(response.content)
        pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
        
        # Extract text from all pages
        text = ""
        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]
            text += page.get_text()
        
        pdf_document.close()
        return text
        
    except Exception as e:
        print(f"Error reading PDF from URL: {e}")
        return None

In [5]:
# Example usage:
url = "https://downloadcenter.samsung.com/content/UM/202505/20250523185456235/SM-S93X_UG_CA_15_Eng_D05_250523.pdf"
text = read_pdf_from_url(url)
print(text)

www.samsung.com/ca
English (CA). 05/2025. Rev.2.0
SM-S931W
SM-S936W
SM-S937W
SM-S938W
USER GUIDE
2
Table of Contents
Getting started
5	
Device layout and functions
13	
Charging the battery
18	
Nano-SIM card and eSIM
21	
Turning the device on and off
22	
Initial setup
22	
Using networks
22	
Samsung account
23	
Transferring data from your previous 
device (Smart Switch)
24	
Understanding the screen
31	
Notification panel
32	
Quick settings panel
34	 Screen capture and screen record
36	
Entering text
38	 Extracting text
Apps and features
39	
Installing or uninstalling apps
40	 Galaxy AI
48	 S Pen (Galaxy S25 Ultra)
55	
Phone
59	
Contacts
62	
Camera
80	 Gallery
85	 Bixby
86	 Bixby Vision
87	
Multi window (Using multiple apps 
at once)
88	 Samsung Internet
89	 Samsung Wallet
93	
Samsung Health
93	
Samsung Notes
97	
Samsung Members
97	
Samsung Kids
98	 Samsung Global Goals
98	 Samsung TV Plus
98	 Samsung Find
98	 Samsung Shop
99	
Galaxy Wearable
99	
PENUP (Galaxy S25 Ultra)
99	
Calendar
100	

In [6]:
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """
    Split text into chunks using RecursiveCharacterTextSplitter.
    
    Args:
        text (str): Input text to split
        chunk_size (int): Maximum size of each chunk
        chunk_overlap (int): Number of characters to overlap between chunks
        
    Returns:
        list: List of text chunks
    """
    if not text:
        return []
    
    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    
    # Split the text
    chunks = text_splitter.split_text(text)
    
    return chunks

In [7]:
# Example usage with the Samsung manual
url = "https://downloadcenter.samsung.com/content/UM/202505/20250523185456235/SM-S93X_UG_CA_15_Eng_D05_250523.pdf"
text = read_pdf_from_url(url)
if text:
    chunks = chunk_text(text)
    print(f"Total chunks created: {len(chunks)}")
    print(f"First chunk preview: {chunks[0][:100]}...")
    print(f"Chunk sizes: {[len(chunk) for chunk in chunks[:5]]}")  # Show first 5 chunk sizes

Total chunks created: 325
First chunk preview: www.samsung.com/ca
English (CA). 05/2025. Rev.2.0
SM-S931W
SM-S936W
SM-S937W
SM-S938W
USER GUIDE
2
T...
Chunk sizes: [995, 998, 992, 995, 951]


### 2. Generate Embeddings and Store in ChromaDB

In [8]:
def create_or_get_vector_db(chunks: List[str], collection_name: str, 
                    persist_directory: str = "./chroma_db") -> chromadb.Collection:
    """
    Create or load a ChromaDB collection with OpenAI embeddings.
    
    Args:
        chunks (List[str]): List of text chunks to embed
        collection_name (str): Name of the ChromaDB collection
        persist_directory (str): Directory to persist the database
        
    Returns:
        chromadb.Collection: The ChromaDB collection
    """
    # Initialize ChromaDB client with persistence
    client = chromadb.PersistentClient(path=persist_directory)
    
    # Get or create collection
    try:
        collection = client.get_collection(name=collection_name, embedding_function=openai_ef)
        if collection:
            print(F"Collection founded! it has {collection.count()} documents")
            return collection

    
    except Exception as e:
        print("Collection not founded; creating it...")
        # Collection doesn't exist, create new one
        collection = client.create_collection(
            name=collection_name,
            embedding_function=openai_ef,
            metadata={"description": "Product manual embeddings with OpenAI"}
        )
        print(f"Created new collection '{collection_name}'")
    
        if chunks:
            # Generate embeddings using OpenAI
            print(f"Embedding {len(chunks)} chunks...")
            
            # Prepare documents and metadata
            documents = chunks
            ids = [f"chunk_{i}" for i in range(len(chunks))]
            metadatas = [{"chunk_index": i, "chunk_length": len(chunk)} for i, chunk in enumerate(chunks)]
            
            # Add documents to collection (ChromaDB will handle embeddings)
            collection.add(
                documents=documents,
                ids=ids,
                metadatas=metadatas
            )
            
            print(f"Successfully embedded {len(chunks)} chunks into ChromaDB")
    
        return collection

In [9]:
samsung_s25_coll = create_or_get_vector_db(chunks=chunks, collection_name="samsung-s25")

Collection founded! it has 325 documents


### 3 Basic RAG CHain

In [10]:
url = "https://downloadcenter.samsung.com/content/UM/202505/20250523185456235/SM-S93X_UG_CA_15_Eng_D05_250523.pdf"
collection_name = "samsung_s25_coll"
question="How to take a screenshot?"

#Create chunks
chunks = chunk_text(read_pdf_from_url(url=url))
# Create or get collection
collection = create_or_get_vector_db(chunks=chunks, collection_name=collection_name)
#Retrieve conxext
context = collection.query(query_texts=[question], n_results=3)
docs = context['documents'][0]

# Initialize LLM
llm = ChatOpenAI(model="gpt-4", temperature=0)
# Create prompt template

prompt = ChatPromptTemplate.from_template("""
You are a helpful assistant that answers questions based on the provided context.
Use only the information from the context to answer the question.
If the answer is not in the context, say "I don't have enough information to answer this question."
Context:
{context}
           
Question: {question}
           
Answer:
""")

rag_chain = (
    RunnableLambda(lambda x: {
        "context": "\n\n".join(docs),
        "question": question
    })
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke({"question":question})

Collection founded! it has 325 documents


'There are two methods to capture a screenshot. The first method is the button capture where you press the Side button and the Volume Down button simultaneously. The second method is the swipe capture where you swipe your hand to the left or right across the screen with the edge of your hand. If capturing a screenshot by swiping is not turned on, you can open Settings, tap Advanced features → Motions and gestures, and then tap the Palm swipe to capture switch to turn it on.'

### Q&A Chatbot on a python library


In [11]:
request_url = "https://app.readthedocs.org/projects/requests/downloads/pdf/latest/"

#1. Read PDF
text = read_pdf_from_url(url=request_url)
#2. create chunks
chunks = chunk_text(text=text)
#3. Create collection
collection = create_or_get_vector_db(chunks=chunks, collection_name='request')

#4. Create simple retriever function
def retrieve_context(query: str, n_results: int = 3):
    """Retrieve relevant context from ChromaDB collection"""
    results = collection.query(query_texts=[query], n_results=n_results)
    return results['documents'][0]

#5. Create prompt and LLM
prompt = ChatPromptTemplate.from_template("""
You are a helpful assistant that answers questions based on the provided context.
Use only the information from the context to answer the question.
If the answer is not in the context, say "I don't have enough information to answer this question."
Context:
{context}
           
Question: {question}
           
Answer:
""")

llm = ChatOpenAI(temperature=0, model='gpt-4')

# Create RAG chain
context_docs = retrieve_context(question)
chain = (
    RunnableLambda(lambda x: {
        "context": "\n\n".join(retrieve_context(question)),
        "question": question
    })
    | prompt
    | llm
    | StrOutputParser()
)


Collection founded! it has 242 documents


In [12]:
# Test the RAG chain
question = 'How do i create a get request?'
response = chain.invoke({"question": question})
print(response)

To create a GET request, you can use the following steps:
1. Import the requests module in Python using the command: `import requests`
2. Use the `requests.get()` function with the URL as the argument. For example: `r = requests.get('https://api.github.com/events')`
This will return a Response object which you can use to get all the information you need.
