# LOADING PDF FILES

This section demonstrates how to load PDF files using the `PyPDFLoader` class from the `langchain_community.document_loaders` module.



In [1]:
# loading a pdf file
# Import library
from langchain_community.document_loaders import PyPDFLoader
# Create a document loader for rag_paper.pdf
loader = PyPDFLoader('rag_paper.pdf')

# Load the document
data = loader.load()
# Load the document
data = loader.load()
print(data[0:4])

[Document(metadata={'source': 'rag_paper.pdf', 'page': 0, 'page_label': '1'}, page_content='STAPHYLOCOCCI\nEUL Faculty of Medicine 2024-2025 Fall\nAssist. Prof. Dr. Emrah Güler'), Document(metadata={'source': 'rag_paper.pdf', 'page': 1, 'page_label': '2'}, page_content='General characteristics\n\uf0a8 Gram-positive spherical cells arranged in \nirregular clusters\n\uf0a4staphylé: a bunch of grapes\n\uf0a8 May also appear as single cells, pairs or short\nchains\n'), Document(metadata={'source': 'rag_paper.pdf', 'page': 2, 'page_label': '3'}, page_content='General characteristics\n\uf0a8 Nonmotile\n\uf0a8 Facultatively anaerobic\n\uf0a8 Do not form spores\n\uf0a8 Produce catalase \n\uf0a8 Resistant to drying, heat and high concentration of salt \n\uf0a4 Can grow in media containing 10% of NaCl\n\uf0a4 Can grow at temperature of 18oC-40oC\n\uf0a8 Produce pigments that vary from white to yellow\n\uf0a8 Slowly ferment many carbohydrates; produce lactic acid\n\uf0a8 Some are members of the n

# LOADING HTML FILES

This section demonstrates how to load HTML files using the `UnstructuredHTMLLoader` class from the `langchain_community.document_loaders` module.

In [4]:
#loading  html file
from langchain_community.document_loaders import UnstructuredHTMLLoader
import nltk


# Load the HTML file
loader = UnstructuredHTMLLoader('cnn-live-news.html')

# Check the loader object to confirm it's working
print(f"Loader object: {loader}")

# Load data from the HTML file
data = loader.load()

# Print out the data to verify it's processed correctly
print(data)


Loader object: <langchain_community.document_loaders.html.UnstructuredHTMLLoader object at 0x00000242941E9C70>


# LOADING MARKDOWN FILES

This section demonstrates how to load markdown files using the `UnstructuredMarkdownLoader` class from the `langchain_community.document_loaders` module.

In [1]:
# Import the necessary module
from langchain_community.document_loaders import UnstructuredMarkdownLoader

# Load the Markdown file
loader = UnstructuredMarkdownLoader('valendata.md')

# Check the loader object to confirm it's working
print(f"Loader object: {loader}")

# Load data from the Markdown file
data = loader.load()

# Print out the data to verify it's processed correctly
print(data)

Loader object: <langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader object at 0x000002846673B830>


# TEXT SPLITTING USING  CharacterTextSplitter

In [2]:
#splitting the text
from langchain.text_splitter import CharacterTextSplitter

# Sample text
text = '''RAG (retrieval augmented generation) is an advanced NLP model that combines retrieval mechanisms with generative capabilities. RAG aims to improve the accuracy and relevance of its outputs by grounding responses in precise, contextually appropriate data.'''

# Define the text splitter with the specified parameters
text_splitter = CharacterTextSplitter(
    separator=".",  # Split based on period
    chunk_size=75,  # Set the chunk size to 75 characters
    chunk_overlap=10  # Set the chunk overlap to 10 characters
)

# Split the text using the defined splitter
chunks = text_splitter.split_text(text)

# Print the resulting chunks and their lengths
print(chunks)
print([len(chunk) for chunk in chunks])


Created a chunk of size 125, which is longer than the specified 75


['RAG (retrieval augmented generation) is an advanced NLP model that combines retrieval mechanisms with generative capabilities', 'RAG aims to improve the accuracy and relevance of its outputs by grounding responses in precise, contextually appropriate data']
[125, 126]


# SPLITTING PDF USING RecursiveCharacterTextSplitter

In [11]:
#splitting pdf
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredHTMLLoader

# Sample text
loader = UnstructuredHTMLLoader('cnn-live-news.html')
data = loader.load()


# Define the text splitter with the specified parameters
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n', '.', ' ', ''],  # Split based on period
    chunk_size=75,  # Set the chunk size to 75 characters
    chunk_overlap=10  # Set the chunk overlap to 10 characters
)

# Split the text using the defined splitter
chunks = text_splitter.split_documents(data)

# Print the resulting chunks and their lengths
print(chunks)
for chunk in chunks:
    print(chunk)
    print(len(chunk.page_content))


page_content='Thank You!' metadata={'source': 'cnn-live-news.html'}
10
page_content='Your effort and contribution in providing this feedback is much' metadata={'source': 'cnn-live-news.html'}
63
page_content='is much appreciated' metadata={'source': 'cnn-live-news.html'}
19
page_content='.' metadata={'source': 'cnn-live-news.html'}
1
page_content='Close

Ad Feedback

Live Updates The opening days of Trump’s presidency' metadata={'source': 'cnn-live-news.html'}
71
page_content='Live Updates Latest round of hostage releases

Live Updates' metadata={'source': 'cnn-live-news.html'}
59
page_content='Hamas frees four Israeli female soldiers in latest round of hostage' metadata={'source': 'cnn-live-news.html'}
67
page_content='hostage releases' metadata={'source': 'cnn-live-news.html'}
16
page_content='By Helen Regan , Kareem El Damanhoury , James Legge , Sophie Tanno and' metadata={'source': 'cnn-live-news.html'}
70
page_content='Tanno and Caitlin Danaher , CNN' metadata={'source': 'cnn-live

This is storing using chroma 


In [None]:
#this is storing using chroma 
# Initialize the OpenAI embedding model
embedding_model = OpenAIEmbeddings(api_key="<OPENAI_API_TOKEN>", model='text-embedding-3-small')
# Create a Chroma vector store and embed the chunks
vector_store = Chroma.from_documents(
    documents=chunks,  # List of text chunks
    embedding=embedding_model,  # Embedding model
)

# COMPLETE RAG-CHAIN

This section demonstrates the complete workflow of a Retrieval-Augmented Generation (RAG) chain, including document loading, text splitting, embedding generation, storage, and querying.

1. **Loading Documents**:
    - HTML documents are loaded using `UnstructuredHTMLLoader`.

2. **Text Splitting**:
    - The loaded documents are split into chunks using `RecursiveCharacterTextSplitter`.

3. **Embedding Generation**:
    - Embeddings for the document chunks are generated using `OpenAIEmbeddings`.

4. **Storing Embeddings**:
    - The generated embeddings are stored in Supabase.

5. **Querying Embeddings**:
    - Relevant document chunks are retrieved based on a query using the stored embeddings.

6. **Generating Answers**:
    - Answers are generated using the retrieved document chunks and `ChatOpenAI`.

The following cells in this notebook demonstrate each step of the RAG chain in detail.


# Supabase Connection


In [4]:
from supabase import create_client, Client
from dotenv import load_dotenv
import os

# Load environment variables from a .env file
load_dotenv()

# Supabase connection setup
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)


 # Initialize OpenAI API



In [50]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

# Replace with your actual API key
openai_api_key = os.getenv('OPENAI_API_KEY')

# Initialize the OpenAI embeddings model
embedding_model = OpenAIEmbeddings(api_key=openai_api_key, model='text-embedding-ada-002')

# Initialize the OpenAI chat model
llm = ChatOpenAI(api_key=openai_api_key, model="gpt-3.5-turbo")


# Load and Split Documents


In [51]:
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def load_and_split_documents(file_path):
    """
    Load an HTML document from the specified file path and split it into chunks.

    Args:
        file_path (str): The path to the HTML file to be loaded.

    Returns:
        list: A list of document chunks, where each chunk is a string.
    """
    loader = UnstructuredHTMLLoader(file_path)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n', '.', ' ', ''],
        chunk_size=75,
        chunk_overlap=10
    )
    return text_splitter.split_documents(data)

# Example Usage:
chunks = load_and_split_documents('mytvv-home.html')


# Generate and Store Embeddings


In [52]:
import json

# Function to insert embedding into Supabase
def insert_embedding(document, embedding, metadata={}):
    data = {
        "embedding": embedding,
        "document": document,
        "metadata": json.dumps(metadata)
    }
    response = supabase.table('vector_store').insert(data).execute()
    if response.data:
        print("Insertion successful")
    else:
        print(f"Error inserting: {response.error_message}")

# Function to generate and store embeddings for document chunks
def generate_and_store_embeddings(chunks, source="CNN live news"):
    for chunk in chunks:
        # Generate embedding for each chunk
        embedding = embedding_model.embed_query(chunk.page_content)
        # Insert the embedding into Supabase
        insert_embedding(chunk.page_content, embedding, {"source": source})
        print(f"Inserted document: {chunk.page_content[:50]}...")

# Example Usage:
generate_and_store_embeddings(chunks)


Insertion successful
Inserted document: Henry Danger...
Insertion successful
Inserted document: Genre: Superhero, Comedy, Action Creator: Dana Ols...
Insertion successful
Inserted document: . Israel Original Run: July 26, 2014 - March 21, 2...
Insertion successful
Inserted document: Number of Seasons: 5 Number of Episodes: 121...
Insertion successful
Inserted document: At the beginning of the show, Henry explains how h...
Insertion successful
Inserted document: . He mentions how Captain Man's father, an irrespo...
Insertion successful
Inserted document: accidentally made him indestructible...
Insertion successful
Inserted document: . Henry wanted a simple after-school job and stumb...
Insertion successful
Inserted document: old store, which led him to becoming Captain Man's...
Insertion successful
Inserted document: ....
Insertion successful
Inserted document: Main Characters

Henry Hart (Kid Danger)...
Insertion successful
Inserted document: Played by: Jace Norman Age: 13-18 (througho

# Retrieve Relevant Chunks from Supabase 

In [56]:
def retrieve_relevant_chunks(question, top_k=3):
    """
    Retrieve the most relevant document chunks for a given question.

    This function takes a question as input, generates an embedding for it using
    the embedding model, and then queries a database to find the top_k most relevant
    document chunks that match the question embedding.

    Args:
        question (str): The question for which relevant document chunks are to be retrieved.
        top_k (int, optional): The number of top matching document chunks to retrieve. Defaults to 3.

    Returns:
        list: A list of the top_k most relevant document chunks.
    """
    question_embedding = embedding_model.embed_query(question)
    response = supabase.rpc('match_documents', {
        'query_embedding': question_embedding,
        'match_count': top_k
    }).execute()
    return response.data

# Example Usage:
# relevant_chunks = retrieve_relevant_chunks("TELL ME ABOUT HENRY")


# Generate Answer using RAG

In [61]:
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableSequence
from langchain_core.output_parsers import StrOutputParser

def generate_answer(question):
    # Retrieve relevant chunks
    relevant_chunks = retrieve_relevant_chunks(question)
    # Combine the content of the relevant chunks into a single context string
    context = "\n".join([chunk['document'] for chunk in relevant_chunks])

    # Define the prompt template for generating the answer
    prompt_template = PromptTemplate.from_template(
        "Answer the question based on the following context:\n\n{context}\n\nQuestion: {question}\nAnswer:"
    )

    # Create the RAG (Retrieval-Augmented Generation) pipeline
    rag_chain = RunnableSequence(
        {"context": RunnablePassthrough(), "question": RunnablePassthrough()},
        prompt_template,
        llm,
        StrOutputParser()
    )

    # Generate the answer using the RAG pipeline
    answer = rag_chain.invoke({"context": context, "question": question})
    return answer

# Example Usage:
question = " What is RAG?" 
answer = generate_answer(question)
print(f"Answer: {answer}")


Answer: I'm sorry, but without more specific information or context, I am unable to provide an accurate answer to the question "What is RAG?". Please provide more details or clarify the context so I can give you a relevant response.


In [1]:
from graphviz import Digraph
from IPython.display import display

def draw_rag_workflow():
    dot = Digraph("RAG Workflow", format="png")
    
    # Nodes
    dot.node("A", "Load & Split Documents")
    dot.node("B", "Generate & Store Embeddings")
    dot.node("C", "Retrieve Relevant Chunks")
    dot.node("D", "Generate Answer")
    dot.node("E", "LangGraph Integration (Upcoming)")
    
    # Edges
    dot.edge("A", "B")
    dot.edge("B", "C")
    dot.edge("C", "D")
    dot.edge("D", "E")
    
    # Render and display
    dot.render("rag_workflow")
    display(Image("rag_workflow.png"))

draw_rag_workflow()


ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

# SQL RAG FUNCTION

In [3]:
from dotenv import load_dotenv
import os

# Load environment variables from a .env file
load_dotenv()

# Retrieve the OpenAI API key from the environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')


In [63]:
from langchain_core.prompts import ChatPromptTemplate

template = """
Based on the table schema below, write a SQL query that would answer the user's question:
{schema}

Question: {question}
SQL Query:
"""

prompt = ChatPromptTemplate.from_template(template)

In [64]:
prompt.format(schema="my schema", question="how many users are there?")


"Human: \nBased on the table schema below, write a SQL query that would answer the user's question:\nmy schema\n\nQuestion: how many users are there?\nSQL Query:\n"

In [65]:
from langchain_community.utilities import SQLDatabase

db_uri = "mysql+mysqlconnector://root:Vondabaic2020@localhost:3306/chinook"
db = SQLDatabase.from_uri(db_uri)

In [66]:
db.run("SELECT* FROM foodstuffs LIMIT 5;")

"[(1, 'Rice', 'Grains', Decimal('45.00'), 20, 'White long-grain rice, 5kg bag', 'https://images.pexels.com/photos/128756/pexels-photo-128756.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500', datetime.datetime(2025, 1, 27, 14, 34, 36)), (2, 'Wheat Flour', 'Grains', Decimal('30.00'), 15, 'High-quality whole wheat flour, 2kg', 'https://images.pexels.com/photos/128756/pexels-photo-128756.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500', datetime.datetime(2025, 1, 27, 14, 34, 36)), (3, 'Bananas', 'Fruits', Decimal('1.20'), 50, 'Fresh ripe bananas, sold per piece', 'https://images.pexels.com/photos/128756/pexels-photo-128756.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500', datetime.datetime(2025, 1, 27, 14, 34, 36)), (4, 'Oranges', 'Fruits', Decimal('1.50'), 40, 'Sweet and juicy oranges, sold per piece', 'https://images.pexels.com/photos/128756/pexels-photo-128756.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500', datetime.datetime(2025, 1, 27, 14, 34, 36)), (5, 'Chicken Breast', 'Meat', Decimal('12.99'), 25, '

In [67]:
def get_schema(_):
    return db.get_table_info()

In [68]:
get_schema(None)

'\nCREATE TABLE foodstuffs (\n\tfood_id INTEGER NOT NULL AUTO_INCREMENT, \n\tfood_name VARCHAR(100) NOT NULL, \n\tcategory VARCHAR(50), \n\tprice DECIMAL(10, 2) NOT NULL, \n\tstock_quantity INTEGER NOT NULL, \n\tdescription TEXT, \n\timage_url VARCHAR(255), \n\tcreated_at TIMESTAMP NULL DEFAULT CURRENT_TIMESTAMP, \n\tPRIMARY KEY (food_id)\n)DEFAULT CHARSET=utf8mb4 COLLATE utf8mb4_0900_ai_ci ENGINE=InnoDB\n\n/*\n3 rows from foodstuffs table:\nfood_id\tfood_name\tcategory\tprice\tstock_quantity\tdescription\timage_url\tcreated_at\n1\tRice\tGrains\t45.00\t20\tWhite long-grain rice, 5kg bag\thttps://images.pexels.com/photos/128756/pexels-photo-128756.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=5\t2025-01-27 14:34:36\n2\tWheat Flour\tGrains\t30.00\t15\tHigh-quality whole wheat flour, 2kg\thttps://images.pexels.com/photos/128756/pexels-photo-128756.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=5\t2025-01-27 14:34:36\n3\tBananas\tFruits\t1.20\t50\tFresh ripe bananas, sold per piece\thttps://images.pexels

In [69]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

llm = ChatOpenAI()
sql_chain = (
    RunnablePassthrough.assign(schema = get_schema)
    | prompt
    | llm.bind(stop ="\nSQL Result:")
    | StrOutputParser()
)


In [70]:
sql_chain.invoke({"question":"how many rice are there"})


"SELECT stock_quantity\nFROM foodstuffs\nWHERE food_name = 'Rice';"

In [88]:
template = """ 
Based on the table schema below, question, sql query, and sql response, write a natural language response and respond like a cashier dont speak too much technical:
{schema}

Question: {question}
SQL Query: {query}
SQL Response: {response}
"""
prompt = ChatPromptTemplate.from_template(template)

In [89]:
def run_query(query):
    return db.run(query)

In [90]:
full_chain = (
    RunnablePassthrough.assign(query=sql_chain).assign(
        schema=get_schema,
        response=lambda variables: run_query(variables["query"])
    )
    | prompt
    | llm.bind(stop="\nNatural Language Response:")
)

In [91]:
full_chain.invoke({"question": "CAN I SEE THE RICE YOU HAVE  ?"})

AIMessage(content='Tablodan baktığım kadarıyla, elimizde beyaz uzun taneli pirinç var. 5 kiloluk paketlerde, fiyatı 45 TL. İsterseniz bakabilirsiniz.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 52, 'prompt_tokens': 517, 'total_tokens': 569, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-2c158971-6e3a-41a4-a943-ec63a3acfa4a-0', usage_metadata={'input_tokens': 517, 'output_tokens': 52, 'total_tokens': 569, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})