In [None]:
# RAG
    # Document Loaders
        # text, pdf, csv, json, WebBase Loader
        # Directory Loader (Multiple Files)

    # Text Splitter

In [17]:
from langchain_community.document_loaders import TextLoader

# Make sure to upload your sample.txt file to the /content/ directory
loader = TextLoader("/content/sample.txt")
docs = loader.load()

print(docs[0].page_content)  # Show first document's content

RuntimeError: Error loading /content/sample.txt

In [2]:
# Text Loader (Is restricted in Zensar system) / CSV Loader Example

In [4]:
pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<2.0.0,>=0.3.75 (from langchain_community)
  Downloading langchain_core-0.3.75-py3-none-any.whl.metadata (5.7 kB)
Collecting requests<3,>=2.32.5 (from langchain_community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.6.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.6.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.6.7->langchain_community)
  Downloading mypy_extensions-1.1.0-py3-none-any.w

In [18]:
from langchain_community.document_loaders import CSVLoader

loader = CSVLoader(file_path="/content/california_housing_test.csv")  # Upload your .csv
docs = loader.load()

print(type(loader))
print(type(docs))
print(loader)

print("📄 First Row Content:\n", docs[10].page_content)
print("🗂️ Metadata:\n", docs[6].metadata)

<class 'langchain_community.document_loaders.csv_loader.CSVLoader'>
<class 'list'>
<langchain_community.document_loaders.csv_loader.CSVLoader object at 0x780355c492b0>
📄 First Row Content:
 longitude: -118.240000
latitude: 33.980000
housing_median_age: 45.000000
total_rooms: 972.000000
total_bedrooms: 249.000000
population: 1288.000000
households: 261.000000
median_income: 2.205400
median_house_value: 125000.000000
🗂️ Metadata:
 {'source': '/content/california_housing_test.csv', 'row': 6}


In [15]:
# PDF Loader

In [None]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("/content/example.pdf")  # Upload your .pdf
pages = loader.load()

print("📄 Page 1 Content:\n", pages[0].page_content[:500])
print("🗂️ Metadata:\n", pages[0].metadata)

In [None]:
# 1 Create an External Knowledge Base
# Doc Loader - but have challenges in uploading text/pdf etc (plain text ext knowledge is generated on the go)

# 2 Integrate LLM -

      # 1. TextSplitter
      # 2. Embeddings
      # 3. Vector Store / Vector DB


# Task
Integrate a plain text knowledge base with an OpenAI LLM using Langchain and Colab's Userdata for API key management.

## Install necessary libraries

### Subtask:
Install the `openai` and `langchain` libraries.


**Reasoning**:
Install the necessary libraries for the task.



In [19]:
pip install openai langchain



## Set up openai api key

### Subtask:
Get your OpenAI API key and store it securely using Colab's Userdata.


**Reasoning**:
Access Colab's Userdata to get the OpenAI API key and store it in a variable.



In [3]:
import os
from google.colab import userdata

# Retrieve the OpenAI API key from Colab's Userdata
openai_api_key = userdata.get('OPENAI_API_KEY')

# Set the environment variable for Langchain
os.environ["OPENAI_API_KEY"] = openai_api_key

print("OpenAI API key loaded successfully.")

OpenAI API key loaded successfully.


## Define the plain text knowledge base

### Subtask:
Create a Python variable containing the plain text you want to use as your knowledge base.


**Reasoning**:
The subtask is to create a Python variable containing the plain text knowledge base. I will define a multiline string variable and populate it with some sample text.



In [4]:
knowledge_base_text = """
This is a sample knowledge base about data science.
Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from data in various forms, both structured and unstructured.

Machine learning is a subset of artificial intelligence (AI) that focuses on building systems that learn from data.
Common machine learning algorithms include linear regression, logistic regression, decision trees, random forests, and support vector machines.

Data visualization is the graphical representation of data. It helps in understanding trends, patterns, and outliers in the data.
Popular data visualization libraries in Python include Matplotlib, Seaborn, and Plotly.

Pandas is a powerful open-source data analysis and manipulation tool, built on top of the Python programming language.
NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.

Natural Language Processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.
"""

print(knowledge_base_text[:200]) # Print the first 200 characters to verify


This is a sample knowledge base about data science.
Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from d


## Split the text

### Subtask:
Use a text splitter to break down the plain text into smaller chunks.


**Reasoning**:
Import the necessary text splitter class and instantiate it to split the `knowledge_base_text` into chunks.



In [5]:
from langchain.text_splitter import CharacterTextSplitter

# Instantiate the text splitter
text_splitter = CharacterTextSplitter(
    separator="\n\n",  # Split by double newline
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

# Split the text into documents
texts = text_splitter.create_documents([knowledge_base_text])

# Print the number of chunks and the content of the first chunk to verify
print(f"Number of chunks: {len(texts)}")
print("First chunk content:\n", texts[0].page_content)
print("Second chunk content:\n", texts[1].page_content)

Number of chunks: 2
First chunk content:
 This is a sample knowledge base about data science.
Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from data in various forms, both structured and unstructured.

Machine learning is a subset of artificial intelligence (AI) that focuses on building systems that learn from data.
Common machine learning algorithms include linear regression, logistic regression, decision trees, random forests, and support vector machines.

Data visualization is the graphical representation of data. It helps in understanding trends, patterns, and outliers in the data.
Popular data visualization libraries in Python include Matplotlib, Seaborn, and Plotly.
Second chunk content:
 Pandas is a powerful open-source data analysis and manipulation tool, built on top of the Python programming language.
NumPy is a library for the Python programming language, adding support for large,

## Create embeddings

### Subtask:
Use an OpenAI embedding model to create vector representations of the text chunks.


**Reasoning**:
Import the necessary class and create the embeddings for the text chunks.



In [6]:
from langchain_openai import OpenAIEmbeddings

# Instantiate OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

# Create embeddings for the text chunks
text_embeddings = embeddings.embed_documents([t.page_content for t in texts])

# Print the number of embeddings to verify
print(f"Number of embeddings created: {len(text_embeddings)}")

Number of embeddings created: 2


**Reasoning**:
The previous command failed because the `langchain_openai` module was not found. I need to install the necessary package to resolve this issue.



In [24]:
pip install langchain-openai

Collecting langchain-openai
  Downloading langchain_openai-0.3.32-py3-none-any.whl.metadata (2.4 kB)
Downloading langchain_openai-0.3.32-py3-none-any.whl (74 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.5/74.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-openai
Successfully installed langchain-openai-0.3.32


**Reasoning**:
Now that `langchain-openai` is installed, I can proceed with importing `OpenAIEmbeddings` and creating the embeddings as per the instructions.



In [16]:
from langchain_openai import OpenAIEmbeddings

# Instantiate OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

# Create embeddings for the text chunks
text_embeddings = embeddings.embed_documents([t.page_content for t in texts])

# Print the number of embeddings to verify
print(f"Number of embeddings created: {len(text_embeddings)}")

Number of embeddings created: 2


## Create a vector store

### Subtask:
Store the text chunks and their embeddings in a vector store for efficient retrieval.


**Reasoning**:
Import the Chroma vector store and create an instance using the text chunks and embeddings.



In [None]:
# Vector Store = Embeddings (FAISS)
# Vector DB -> Vector Store + DB Features

In [7]:
from langchain_community.vectorstores import Chroma # Pine-Cone

# Instantiate the vector store
vectorstore = Chroma.from_documents(texts, embeddings)

# Print a confirmation message
print("Vector store created successfully.")

Vector store created successfully.


**Reasoning**:
The previous command failed because the `chromadb` library is not installed. Install the required library.



In [27]:
pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.20-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.36.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [

**Reasoning**:
Now that `chromadb` is installed, attempt to create the Chroma vector store again.



In [1]:
from langchain_community.vectorstores import Chroma

# Instantiate the vector store
vectorstore = Chroma.from_documents(texts, embeddings)

# Print a confirmation message
print("Vector store created successfully.")

NameError: name 'texts' is not defined

**Reasoning**:
The previous command was interrupted by a KeyboardInterrupt. Re-run the command to create the Chroma vector store.



In [1]:
from langchain_community.vectorstores import Chroma

# Instantiate the vector store
vectorstore = Chroma.from_documents(texts, embeddings)

# Print a confirmation message
print("Vector store created successfully.")

NameError: name 'texts' is not defined

**Reasoning**:
The variable `texts` is not defined. This is likely due to a notebook state issue. Re-execute the code cell that defines `texts`.



In [2]:
from langchain.text_splitter import CharacterTextSplitter

# Instantiate the text splitter
text_splitter = CharacterTextSplitter(
    separator="\n\n",  # Split by double newline
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

# Split the text into documents
texts = text_splitter.create_documents([knowledge_base_text])

# Print the number of chunks and the content of the first chunk to verify
print(f"Number of chunks: {len(texts)}")
print("First chunk content:\n", texts[0].page_content)

NameError: name 'knowledge_base_text' is not defined

**Reasoning**:
The variable `knowledge_base_text` is not defined. This is likely due to a notebook state issue. Re-execute the code cell that defines `knowledge_base_text`.



In [3]:
knowledge_base_text = """
This is a sample knowledge base about data science.
Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from data in various forms, both structured and unstructured.

Machine learning is a subset of artificial intelligence (AI) that focuses on building systems that learn from data.
Common machine learning algorithms include linear regression, logistic regression, decision trees, random forests, and support vector machines.

Data visualization is the graphical representation of data. It helps in understanding trends, patterns, and outliers in the data.
Popular data visualization libraries in Python include Matplotlib, Seaborn, and Plotly.

Pandas is a powerful open-source data analysis and manipulation tool, built on top of the Python programming language.
NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.

Natural Language Processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.
"""

print(knowledge_base_text[:200]) # Print the first 200 characters to verify


This is a sample knowledge base about data science.
Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from d


**Reasoning**:
Now that `knowledge_base_text` is defined, re-execute the code cell to define the `texts` variable by splitting the text.



In [4]:
from langchain.text_splitter import CharacterTextSplitter

# Instantiate the text splitter
text_splitter = CharacterTextSplitter(
    separator="\n\n",  # Split by double newline
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

# Split the text into documents
texts = text_splitter.create_documents([knowledge_base_text])

# Print the number of chunks and the content of the first chunk to verify
print(f"Number of chunks: {len(texts)}")
print("First chunk content:\n", texts[0].page_content)

Number of chunks: 2
First chunk content:
 This is a sample knowledge base about data science.
Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from data in various forms, both structured and unstructured.

Machine learning is a subset of artificial intelligence (AI) that focuses on building systems that learn from data.
Common machine learning algorithms include linear regression, logistic regression, decision trees, random forests, and support vector machines.

Data visualization is the graphical representation of data. It helps in understanding trends, patterns, and outliers in the data.
Popular data visualization libraries in Python include Matplotlib, Seaborn, and Plotly.


**Reasoning**:
Now that both `knowledge_base_text` and `texts` are defined, re-execute the code cell to create the Chroma vector store.



In [5]:
from langchain_community.vectorstores import Chroma

# Instantiate the vector store
vectorstore = Chroma.from_documents(texts, embeddings)

# Print a confirmation message
print("Vector store created successfully.")

NameError: name 'embeddings' is not defined

## Create embeddings retry 1

### Subtask:
Use an OpenAI embedding model to create vector representations of the text chunks.


**Reasoning**:
Instantiate the OpenAIEmbeddings class and create embeddings for the text chunks.



In [6]:
from langchain_openai import OpenAIEmbeddings

# Instantiate OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

# Create embeddings for the text chunks
text_embeddings = embeddings.embed_documents([t.page_content for t in texts])

# Print the number of embeddings to verify
print(f"Number of embeddings created: {len(text_embeddings)}")

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

**Reasoning**:
The error indicates that the OPENAI_API_KEY environment variable is not set. I need to set it again before instantiating the OpenAIEmbeddings.



In [7]:
import os
from google.colab import userdata

# Retrieve the OpenAI API key from Colab's Userdata
openai_api_key = userdata.get('OPENAI_API_KEY')

# Set the environment variable for Langchain
os.environ["OPENAI_API_KEY"] = openai_api_key

from langchain_openai import OpenAIEmbeddings

# Instantiate OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

# Create embeddings for the text chunks
text_embeddings = embeddings.embed_documents([t.page_content for t in texts])

# Print the number of embeddings to verify
print(f"Number of embeddings created: {len(text_embeddings)}")

Number of embeddings created: 2


## Create a vector store

### Subtask:
Store the text chunks and their embeddings in a vector store for efficient retrieval.


**Reasoning**:
Import the Chroma class and instantiate it using the from_documents class method with the texts and embeddings.



In [8]:
from langchain_community.vectorstores import Chroma

# Instantiate the vector store
vectorstore = Chroma.from_documents(texts, embeddings)

# Print a confirmation message
print("Vector store created successfully.")

Vector store created successfully.


## Set up a retrieval chain

### Subtask:
Create a Langchain retrieval chain that uses the vector store and an OpenAI LLM to answer questions based on the text.


**Reasoning**:
The goal is to create a Langchain retrieval chain. This involves importing necessary classes and functions, defining a prompt, creating a document stuffing chain, and finally creating the retrieval chain.



In [8]:
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain # Corrected import
from langchain.chains.retrieval import create_retrieval_chain # Corrected import
from langchain_core.prompts import ChatPromptTemplate

# 2. Instantiate a ChatOpenAI model
llm = ChatOpenAI(model="gpt-4o-mini")

# 4. Define a prompt for the retrieval chain
prompt = ChatPromptTemplate.from_template("""Answer the user's question based on the following context:

{context}

Question: {input}

If you cannot find the answer in the context, please state that you do not know.""")

# 5. Create a chain to stuff the documents into the prompt
stuff_documents_chain = create_stuff_documents_chain(llm, prompt)

# 6. Create the retrieval chain
retrieval_chain = create_retrieval_chain(vectorstore.as_retriever(), stuff_documents_chain)

print("Retrieval chain created successfully.")

ImportError: cannot import name 'create_stuff_documents_chain' from 'langchain.chains' (/usr/local/lib/python3.12/dist-packages/langchain/chains/__init__.py)

**Reasoning**:
The previous code failed because `create_stuff_documents_chain` and `create_retrieval_chain` are not in `langchain.chains`. They should be imported from `langchain.chains.combine_documents` and `langchain.chains.retrieval` respectively. I will fix the imports and rerun the code to create the retrieval chain.



In [9]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate

# 2. Instantiate a ChatOpenAI model
llm = ChatOpenAI(model="gpt-4o-mini")

# 4. Define a prompt for the retrieval chain
prompt = ChatPromptTemplate.from_template("""Answer the user's question based on the following context:

{context}

Question: {input}

If you cannot find the answer in the context, please state that you do not know.""")

# 5. Create a chain to stuff the documents into the prompt
stuff_documents_chain = create_stuff_documents_chain(llm, prompt)

# 6. Create the retrieval chain
retrieval_chain = create_retrieval_chain(vectorstore.as_retriever(), stuff_documents_chain)

print("Retrieval chain created successfully.")

Retrieval chain created successfully.


## Query the llm

### Subtask:
Ask a question to the retrieval chain and get an answer based on the plain text knowledge base.


**Reasoning**:
Define a question and invoke the retrieval chain to get an answer based on the knowledge base.



In [11]:
# Define the question
question = "What is Gen-AI?"

# Invoke the retrieval chain
response = retrieval_chain.invoke({"input": question})

# Print the answer
print(response["answer"])

I do not know.


In [12]:
print("Enter your questions about the knowledge base. Type 'exit' or 'quit' to end.")

while True:
    question = input("You: ")
    if question.lower() in ["exit", "quit"]:
        break

    try:
        response = retrieval_chain.invoke({"input": question})
        print("Bot:", response["answer"])
    except Exception as e:
        print(f"An error occurred: {e}")

print("Exiting conversation.")

Enter your questions about the knowledge base. Type 'exit' or 'quit' to end.
You: what is Data Science
Bot: Data science is an interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from data in various forms, both structured and unstructured.
You: What is Python
Bot: The provided context does not explicitly define Python. However, it is implied as a programming language that serves as the foundation for tools such as Pandas and NumPy, which are used for data analysis and manipulation. If you need a specific definition of Python, I do not know.
You: how cricket is played
Bot: I do not know.
You: how gen is studied
Bot: I do not know.
You: Data Science
Bot: Data science is an interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from data in various forms, both structured and unstructured.
You: What is NLP
Bot: NLP, or Natural Language Processing, is a

In [13]:
pip install gradio



In [14]:
def chatbot_response(question):
    """
    Gets a response from the retrieval chain based on the user's question.
    """
    try:
        response = retrieval_chain.invoke({"input": question})
        return response["answer"]
    except Exception as e:
        return f"An error occurred: {e}"

In [17]:
import gradio as gr

# Create the Gradio interface
iface = gr.Interface(
    fn=chatbot_response,
    inputs=gr.Textbox(label="Your Question"),
    outputs=gr.Textbox(label="Chatbot Response"),
    title="Knowledge Base Chatbot",
    description="Ask questions about the data science knowledge base."
)

# Launch the interface
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a53069843ec9c1e82d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [18]:
# End of the Notebook

## Summary:

### Data Analysis Key Findings

*   The necessary libraries (`openai`, `langchain`, `langchain-openai`, `chromadb`) were installed to facilitate the integration.
*   The OpenAI API key was successfully retrieved from Colab's Userdata and set as an environment variable, which is crucial for authenticating with OpenAI services.
*   A plain text knowledge base about data science was defined as a Python variable.
*   The knowledge base text was successfully split into 2 smaller chunks using `CharacterTextSplitter`.
*   Embeddings for the text chunks were successfully generated using `OpenAIEmbeddings`.
*   The text chunks and their embeddings were stored in a `Chroma` vector store for efficient retrieval.
*   A Langchain retrieval chain was successfully set up using a `ChatOpenAI` model, a prompt template, and the created vector store.
*   Querying the retrieval chain with the question "What is data science?" successfully returned an answer based on the provided knowledge base.

### Insights or Next Steps

*   The process demonstrates a successful integration of a plain text knowledge base with an OpenAI LLM using Langchain for question answering.
*   Future steps could involve expanding the knowledge base with more comprehensive information and evaluating the retrieval chain's performance on a wider range of questions.
