In [1]:
%pwd

'c:\\Users\\sujee\\Desktop\\New folder (2)\\Mental-Health-Chatbot-using-LLM-Pinecone-and-Langchain\\research'

In [2]:
import os
# The kernel starts in research/, but the relative paths used below (e.g. 'data/')
# are resolved against the project root, so step up exactly one level.
# The guard keeps this cell idempotent: re-running it must not climb any further.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
%pwd

'c:\\Users\\sujee\\Desktop\\New folder (2)\\Mental-Health-Chatbot-using-LLM-Pinecone-and-Langchain'

In [4]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
#Extract data from the PDF
def load_pdf_file(data):
    """Load the PDF files found under ``data`` into documents.

    Args:
        data: Directory path handed to ``DirectoryLoader``. Only entries
            matching the glob ``*.pdf`` are picked up, and each one is
            read with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for those files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
extracted_data=load_pdf_file('data/')

In [7]:
# Preview only the first few documents; displaying the full list would embed the
# text of every loaded page in the notebook output.
extracted_data[:5]

[Document(metadata={'source': 'data\\book_data.pdf', 'page': 0, 'page_label': 'i'}, page_content=''),
 Document(metadata={'source': 'data\\book_data.pdf', 'page': 1, 'page_label': 'ii'}, page_content='The GALE\nENCYCLOPEDIA of\nMENTAL\nDisorders'),
 Document(metadata={'source': 'data\\book_data.pdf', 'page': 2, 'page_label': 'iii'}, page_content='ELLEN THACKERY AND MADELINE HARRIS, EDITORS\nThe GALE\nENCYCLOPEDIA of\nMENTAL\nDisorders\nVOLUME\nA-L\n1'),
 Document(metadata={'source': 'data\\book_data.pdf', 'page': 3, 'page_label': 'iv'}, page_content='The GALE\nENCYCLOPEDIA of\nMENTAL\nDisorders\nVOLUME\nM-Z\n2\nELLEN THACKERY AND MADELINE HARRIS, EDITORS'),
 Document(metadata={'source': 'data\\book_data.pdf', 'page': 4, 'page_label': 'v'}, page_content='The Gale Encyclopedia of Mental Disorders\nProject Editor\nEllen Thackery\nEditorial\nDeirdre S. Blanchfield, Madeline Harris, Kate\nKretschmann, Brigham Narins, Mark Springer\nEditorial Systems Support\nAndrea Lopeman\nPermissions\nMar

In [8]:
len(extracted_data)

1189

In [9]:
#Split Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (in this notebook, the value
            returned by ``load_pdf_file``).
        chunk_size: Maximum chunk size passed to the splitter. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20, the value that used to be hard-coded.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [10]:
text_chunks = text_split(extracted_data)
print("length of my chunks:", len(text_chunks))

length of my chunks: 11389


In [11]:
from langchain_huggingface import HuggingFaceEmbeddings

In [12]:
#download embedding model
def download_hugging_face_embeddings():
    """Create the embedding model shared by the indexing and retrieval cells.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [13]:

embeddings = download_hugging_face_embeddings()

  from .autonotebook import tqdm as notebook_tqdm





In [14]:
embeddings

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [15]:
from dotenv import load_dotenv
load_dotenv()

True

In [16]:
# Read the Pinecone key from the environment (expected to come from the .env file
# loaded in the previous cell). Stop here with a clear message if it is absent,
# rather than carrying None forward and failing later with a less obvious error.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
if not PINECONE_API_KEY:
    raise RuntimeError("PINECONE_API_KEY is not set; add it to your .env file or environment.")

In [20]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "book-database"

# create_index fails when an index with this name already exists, so only create
# it when it is missing. This keeps the cell safe under Restart & Run All.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the vector size produced by the embedding model
        metric="cosine",  # similarity metric used by the index
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [21]:
# Write the key back into the process environment. PINECONE_API_KEY was itself read
# from os.environ earlier, so this re-exports the same value.
# NOTE(review): presumably kept so langchain_pinecone can pick the key up from the
# environment — confirm. Raises TypeError if the key was missing (value is None).
os.environ['PINECONE_API_KEY']=PINECONE_API_KEY

In [22]:
from langchain_pinecone import PineconeVectorStore
# Embed each chunk with `embeddings` and upsert the vectors into the Pinecone index.
# NOTE(review): this submits all of `text_chunks` every time the cell runs. The ids in
# the retrieval output further down look auto-generated, so a re-run presumably adds
# duplicate vectors — confirm, and use the from_existing_index cell once populated.

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [23]:
# Load the existing index: re-binds `docsearch` to the index named `index_name`,
# using the same embedding model that was used when the chunks were indexed.
# NOTE(review): PineconeVectorStore is already imported in the previous cell; the
# import below is a duplicate.

from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [24]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x288ce485a20>

In [25]:
# Retriever over the vector store using similarity search.
# k=3: presumably the number of documents returned per query — confirm.
retriever=docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [26]:
retriever_docs=retriever.invoke("What is Autism")

In [27]:
retriever_docs

[Document(id='17e22e01-7d67-4ee3-ae3d-8761937e50d7', metadata={'page': 112.0, 'page_label': '97', 'source': 'data\\book_data.pdf'}, page_content='(800) 233-4050. Web site: <http://chadd.org>.\nPaula Anne Ford-Martin, M.A.\nLaith Farid Gulli, M.D.\nNicole Mallory, M.S.,PA-C\nAutism\nDefinition\nThe term “autism” refers to a cluster of conditions\nappearing early in childhood. All involve severe impair-\nments in social interaction, communication, imaginative\nabilities, and rigid, repetitive behaviors. To be considered\nan autistic disorder, some of these impairments must be\nmanifest before the age of three.'),
 Document(id='3731445a-66e9-44b1-ac43-c9c60f9ae759', metadata={'page': 113.0, 'page_label': '98', 'source': 'data\\book_data.pdf'}, page_content='or odd.\nIn adolescence and adulthood, some of the higher-\nfunctioning individuals with autistic disorders may\nappear overly formal and polite. They may react with lit-\ntle spontaneity, as if social interaction doesn’t come nat-\nur

In [56]:
from dotenv import load_dotenv
load_dotenv()

True

In [57]:
# Read the Groq key from the environment (expected to come from the .env file
# loaded in the previous cell). Stop here with a clear message if it is absent,
# rather than passing None to the chat model and failing later.
groq_api_key = os.environ.get('GROQ_API_KEY')
if not groq_api_key:
    raise RuntimeError("GROQ_API_KEY is not set; add it to your .env file or environment.")

In [59]:
from langchain_groq import ChatGroq
# Chat model handed to the question-answering chain built below.
# NOTE(review): hosted model ids get retired over time — confirm that
# "llama3-8b-8192" is still offered before relying on this cell.
llm = ChatGroq(groq_api_key=groq_api_key,model_name="llama3-8b-8192")

In [61]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the question-answering chain. The adjacent string literals are
# concatenated into a single string. "{context}" is a template placeholder,
# presumably filled with the retrieved documents by create_stuff_documents_chain
# — confirm.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


# Two-message chat template: the system instructions, then the user's question,
# substituted for "{input}" (the key passed to rag_chain.invoke in the cells below).
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [62]:
# question_answer_chain: built from `llm` and `prompt` via create_stuff_documents_chain.
# rag_chain: combines `retriever` with that chain. The cells below invoke it with
# {"input": <question>} and read the reply from the "answer" key of the result.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [63]:
response = rag_chain.invoke({"input": "What is Autism?"})
print(response["answer"])

According to the provided context, Autism refers to a cluster of conditions appearing early in childhood, involving severe impairments in social interaction, communication, imaginative abilities, and rigid, repetitive behaviors, with some of these impairments manifesting before the age of three.


In [64]:
response = rag_chain.invoke({"input": "What is Asperger’s disorder?"})
print(response["answer"])

Asperger's disorder is a developmental disorder of childhood that is characterized by difficulties in social interactions and repetitive patterns of behavior and activities. It is similar to autism, but children with Asperger's do not have the same difficulties in acquiring language that children with autism have.
