In [97]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
import os
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

In [98]:

def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [100]:
# os.chdir("CHATBOT_GENERATIVE_AI")

In [101]:
%pwd

'/Users/shashanksnaik/Desktop/Chatbot_gen_AI/Chatbot_Generative_AI'

In [102]:
extracted_data = load_pdf("data/")

In [103]:
# extracted_data

In [104]:
#Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [105]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 6973


In [106]:
# text_chunks

In [107]:
from huggingface_hub import hf_hub_download


In [108]:
#download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [109]:
embeddings = download_hugging_face_embeddings()

In [110]:
# query_result = embeddings.embed_query("Hello world")
# print("length" , len(query_result))
# #dimention is 384

In [111]:
from dotenv import load_dotenv
load_dotenv()

True

In [112]:
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [113]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

In [114]:
pc = Pinecone(api_key=PINECONE_API_KEY)

# Set the index name
index_name = "chatbot"

# Create the index with the specified configuration
pc.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2025-04', 'x-cloud-trace-context': 'c1d8a5b73f7f565a6cd52a20beff3690', 'date': 'Mon, 16 Jun 2025 20:20:01 GMT', 'server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [115]:
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [116]:
from langchain_pinecone import PineconeVectorStore

# Assuming `text_chunks` and `embeddings` are already defined
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name, 
    embedding=embeddings
)


In [117]:
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name, 
    embedding=embeddings
)

In [118]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x3c0e7a1a0>

In [119]:
retriever = docsearch.as_retriever(search_type = 'similarity', search_kwargs = {"k" : 3})

In [120]:
retrieved_docs = retriever.invoke("what is Acne?")

In [121]:
retrieved_docs

[Document(id='536ade14-3bc0-4852-9074-4a44b5513ae1', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'moddate': '2017-05-01T10:37:35-07:00', 'page': 425.0, 'page_label': '426', 'producer': 'GPL Ghostscript 9.10', 'source': 'data/Book.pdf', 'subject': '', 'title': '', 'total_pages': 759.0}, page_content='Corticosteriod —A group of synthetic hormones\nthat are used to prevent or reduce inflammation.\nToxic effects may result from rapid withdrawal after\nprolonged use or from continued use of large doses.\nPatch test—A skin test that is done to identify aller-\ngens. A suspected substance is applied to the skin.\nAfter 24–48 hours, if the area is red and swollen,\nthe test is positive for that substance. If no reaction\noccurs, another substance is applied. This is con-'),
 Document(id='4178c821-84f0-496c-90b6-5e5c7c48b7d8', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'moddate': '20

In [122]:
from langchain_openai import OpenAI
llm = OpenAI(temperature=0.4, max_tokens= 500)

In [123]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate



system_prompt = (
    "You are an assistant for question answering task, "
    "use the following pieces of retrieved contex to answer. "
    "the question. If you don't kniw the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise. "
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [124]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [126]:
response = rag_chain.invoke({"input" : "what is acne?"
""})
print(response['answer'])



Acne is a common skin condition caused by clogged pores and inflammation. It can result in pimples, blackheads, and whiteheads on the face, chest, and back. It is often treated with topical medications or oral antibiotics.
