In [81]:
import os # to access environment variables
from dotenv import load_dotenv # to load .env file

import openai
import langchain
import pinecone 

# Data loading and text splitting
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# --- Core LangChain Components ---
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
# Embeddings and LLMs
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
# Pinecone and LangChain integration
from langchain_pinecone import PineconeVectorStore


In [73]:
# Load the documents stored in a directory
def read_doc(directory):
    """Return whatever PyPDFDirectoryLoader loads from `directory`.

    Args:
        directory: Path handed unchanged to PyPDFDirectoryLoader.

    Returns:
        The result of the loader's ``load()`` call.
    """
    return PyPDFDirectoryLoader(directory).load()

In [74]:
# Take the PDF folder from the DOCS_DIR environment variable, falling back to
# a path relative to the working directory, instead of a machine-specific
# absolute path that only exists on one computer.
DOCS_DIR = os.getenv("DOCS_DIR", "documents/")
doc = read_doc(DOCS_DIR)
len(doc)

58

In [75]:
## Split the loaded docs into chunks
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split `docs` with a RecursiveCharacterTextSplitter and return the chunks.

    Args:
        docs: Documents to split; passed to the splitter's ``split_documents``.
        chunk_size: Forwarded unchanged to the splitter (default 800).
        chunk_overlap: Forwarded unchanged to the splitter (default 50).

    Returns:
        The result of ``split_documents(docs)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

In [76]:
# Chunking produces more entries than the 58 loaded documents.
chunked_doc = chunk_data(doc)
len(chunked_doc)

140

In [77]:
""" 
load_dotenv(override=True) 
"""
# Changing the API key in the .env file and saving it does not replace the value already loaded into the environment unless load_dotenv(override=True) (shown above) is run.

' \nload_dotenv(override=True) \n'

In [78]:
# OpenAI embedding setup
# Load the variables from the .env file first, then create the embeddings client.
# NOTE(review): presumably the client picks up its API key from those variables — confirm.
load_dotenv()
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x1101f9480>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x1101fb130>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [None]:
# Vector search DB in Pinecone
""" This single step connects to Pinecone, creates embeddings for your documents,
    and stores them in the specified index. Pass chunked data, not raw data. """

# Name of the Pinecone index that receives the chunk embeddings.
index_name = "langchainvector"

# Documents and embedding model are passed positionally; the index is selected by name.
vectorstore = PineconeVectorStore.from_documents(chunked_doc, embeddings, index_name=index_name)

In [82]:
def create_rag_chain(vectorstore):
    """Build a retrieval-augmented QA chain with LCEL (LangChain Expression Language).

    Args:
        vectorstore: Vector store to retrieve from; only its ``as_retriever``
            method is used here.

    Returns:
        A runnable chain. ``invoke(question)`` retrieves the 3 most similar
        chunks, fills the prompt with their text and the question, calls the
        chat model, and parses the reply with ``StrOutputParser``.
    """
    llm = ChatOpenAI(model="gpt-5-nano", temperature=0.5)
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

    def _format_docs(docs):
        # Pass only the chunk text to the prompt. Without this step the
        # retrieved Document objects themselves are interpolated into
        # {context}, so the model sees object reprs and metadata noise.
        return "\n\n".join(doc.page_content for doc in docs)

    template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

    Context: {context} 

    Question: {question} 

    Answer:
    """
    prompt = ChatPromptTemplate.from_template(template)

    # The dict fans the incoming question out to both prompt variables:
    # "context" runs retrieval + formatting, "question" passes it through unchanged.
    rag_chain = (
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    print("RAG chain created successfully.")
    return rag_chain

In [83]:
# Build the question-answering chain on top of the Pinecone vector store created above.
qa_chain = create_rag_chain(vectorstore)

RAG chain created successfully.


In [86]:
# Ask a sample question: invoke() runs the whole chain on the query, and the
# result is printed below.
our_query = "Which skill will be launched by government to create employability and train youth?"
answer = qa_chain.invoke(our_query)
print(answer)

Pradhan Mantri Kaushal Vikas Yojana 4.0 (PMKVY 4.0) will be launched to skill lakhs of youth and enhance their employability.


In [65]:
import os
from dotenv import load_dotenv

# --- Document Loading and Splitting ---
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Core LangChain Components ---
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# --- Integrations ---
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_pinecone import PineconeVectorStore

# 1. Load environment variables from the .env file.
# NOTE(review): the OpenAI and Pinecone clients created further down presumably
# read their API keys from these variables — confirm.
load_dotenv()

# --- DATA INGESTION AND PROCESSING ---

def load_documents(directory='documents/'):
    """Load the PDF documents under `directory`, reporting progress on stdout.

    Args:
        directory: Folder handed to PyPDFDirectoryLoader (default 'documents/').

    Returns:
        The result of the loader's ``load()`` call.
    """
    print("Loading documents...")
    loaded = PyPDFDirectoryLoader(directory).load()
    print(f"Loaded {len(loaded)} document(s).")
    return loaded

def split_documents(documents, chunk_size=1000, chunk_overlap=100):
    """Split documents into smaller chunks, reporting progress on stdout.

    Args:
        documents: Documents to split; passed to the splitter's
            ``split_documents``.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter. Defaults to
            1000, the value that was previously hard-coded.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter. Defaults
            to 100, the value that was previously hard-coded.

    Returns:
        The chunks produced by the splitter.
    """
    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunked_documents = text_splitter.split_documents(documents)
    print(f"Split into {len(chunked_documents)} chunks.")
    return chunked_documents

# --- VECTOR STORE SETUP ---

def setup_vectorstore(chunked_documents, index_name="langchainvector"):
    """Embed the chunks and upload them to a Pinecone index.

    Each chunk is uploaded under a deterministic ID derived from its source,
    page, position and text. Re-running the ingestion with the same chunks
    therefore reuses the same IDs instead of adding a second copy of every
    vector under fresh random IDs.

    Args:
        chunked_documents: Chunks to ingest; each must expose ``page_content``
            and may expose a ``metadata`` dict.
        index_name: Name of the Pinecone index to write to. Defaults to
            "langchainvector", the value that was previously hard-coded.

    Returns:
        The vector store returned by ``PineconeVectorStore.from_documents``.
    """
    import hashlib  # local import: only needed to derive stable chunk IDs

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    # Materialize once so the chunks can be both fingerprinted and uploaded.
    chunked_documents = list(chunked_documents)

    chunk_ids = []
    for position, chunk in enumerate(chunked_documents):
        metadata = getattr(chunk, "metadata", None) or {}
        # The position keeps IDs unique within one run even when two chunks
        # carry identical text; the unit-separator avoids ambiguous joins.
        fingerprint = "\x1f".join(
            (
                str(metadata.get("source", "")),
                str(metadata.get("page", "")),
                str(position),
                chunk.page_content,
            )
        )
        # backslashreplace keeps hashing from failing on text that cannot be
        # encoded as UTF-8 (e.g. lone surrogates).
        digest = hashlib.sha256(fingerprint.encode("utf-8", "backslashreplace"))
        chunk_ids.append(digest.hexdigest())

    print(f"Creating/updating Pinecone vector store '{index_name}'...")
    # This creates the embeddings and uploads them to Pinecone under the
    # stable IDs computed above.
    vectorstore = PineconeVectorStore.from_documents(
        documents=chunked_documents,
        embedding=embeddings,
        index_name=index_name,
        ids=chunk_ids,
    )
    print("Vector store is ready.")
    return vectorstore

# --- RAG CHAIN SETUP ---

def create_rag_chain(vectorstore):
    """Creates a RAG chain using LCEL.

    Args:
        vectorstore: Vector store to retrieve from; only its ``as_retriever``
            method is used here.

    Returns:
        A runnable chain. ``invoke(question)`` retrieves chunks from the
        vector store, fills the prompt with their text and the question,
        calls the chat model, and parses the reply with ``StrOutputParser``.
    """
    print("Creating RAG chain...")

    # Chat model that writes the final answer
    llm = ChatOpenAI(model="gpt-4o")

    # Create a retriever to fetch relevant documents from the vector store
    retriever = vectorstore.as_retriever()

    def _format_docs(docs):
        # Pass only the chunk text to the prompt. Without this step the
        # retrieved Document objects themselves are interpolated into
        # {context}, so the model sees object reprs and metadata noise.
        return "\n\n".join(doc.page_content for doc in docs)

    # Define the prompt template
    template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

    Context: {context} 

    Question: {question} 

    Answer:
    """
    prompt = ChatPromptTemplate.from_template(template)

    # Create the RAG chain using LangChain Expression Language (LCEL).
    # The dict fans the incoming question out to both prompt variables:
    # "context" runs retrieval + formatting, "question" passes it through unchanged.
    rag_chain = (
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    print("RAG chain created successfully.")
    return rag_chain

# --- MAIN EXECUTION ---

if __name__ == '__main__':
    # Ingestion: load the PDFs, split them into chunks, then index the chunks in Pinecone.
    docs = load_documents()
    chunks = split_documents(docs)
    vector_store = setup_vectorstore(chunks)

    # Build the question-answering chain on top of the populated vector store.
    qa_chain = create_rag_chain(vector_store)

    # Run one sample question through the chain and show the reply.
    query = "How much the agriculture target will be increased by how many crore?"
    print(f"\nQuerying the chain with: '{query}'")
    answer = qa_chain.invoke(query)

    print("\n--- Answer ---", answer, sep="\n")

Loading documents...
Loaded 58 document(s).
Splitting documents into chunks...
Split into 119 chunks.
Creating/updating Pinecone vector store 'langchainvector'...
Vector store is ready.
Creating RAG chain...
RAG chain created successfully.

Querying the chain with: 'How much the agriculture target will be increased by how many crore?'

--- Answer ---
The agriculture credit target will be increased to ₹20 lakh crore.
