In [1]:
import os

# Move up to the parent directory so that relative paths such as 'Data/'
# resolve. The guard keeps this cell idempotent: once the 'Data' folder is
# visible from the working directory, re-running the cell no longer climbs
# one directory further each time.
if not os.path.isdir('Data'):
    os.chdir('../')

In [2]:
# Show the current working directory to confirm where relative paths resolve from.
%pwd

'c:\\Users\\Rattapong.Pojpatin\\OneDrive - Interpublic\\Documents\\GitHub\\Data-Science-Portfolio\\Projects\\Generative AI'

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Files matching the ``*.pdf``
            glob are read with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [5]:
# Load the PDFs from the Data/ folder (relative to the working directory).
extracted_data = load_pdf_file('Data/')

In [6]:
# extracted_data

In [7]:
def text_split(extracted_data, chunk_size = 500, chunk_overlap = 20):
    """Split loaded documents into smaller overlapping chunks.

    Args:
        extracted_data: Documents to split, passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value this notebook was originally run with.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 20.

    Returns:
        The chunks produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap
    )
    return text_splitter.split_documents(extracted_data)

In [8]:
# Split the loaded pages and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print(f'Length of Text Chunks: {len(text_chunks)}')

Length of Text Chucks: 5860


In [9]:
# text_chunks

In [10]:
from langchain.embeddings import HuggingFaceEmbeddings

In [11]:
def download_hugging_face_embeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2'):
    """Build the HuggingFace embedding model used to vectorise text.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            The default model yields 384-dimensional vectors (see the
            length check later in this notebook).

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name = model_name)

In [12]:
# Instantiate the embedding model once; later cells reuse it for indexing and querying.
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2') # 384 dimensional vector
  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Smoke test: embed a short string and print the vector length
# (the saved output shows 384).
query_result = embeddings.embed_query('Test')
print(len(query_result))

384


In [14]:
from dotenv import load_dotenv

# Load variables from a .env file so the API keys can be read from os.environ below.
load_dotenv()

True

In [15]:
# Read the service credentials from the environment; each is None when unset.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')

In [17]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# Client for the Pinecone control plane, authenticated with the key read earlier.
pc = Pinecone(api_key = PINECONE_API_KEY)

index_name = 'testbot'

# Create the index only when it does not exist yet. This keeps the cell safe
# to re-run, and lets the notebook work from a fresh account instead of
# relying on a creation call that was run once and then commented out.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name = index_name,
        dimension = 384,  # must equal the embedding vector length (384 for all-MiniLM-L6-v2)
        metric = 'cosine',
        spec = ServerlessSpec(
            cloud = 'aws',
            region = 'us-east-1'
        )
    )

In [18]:
# Re-export the credentials under the variable names the client libraries look up.
# os.environ accepts only strings, so keys that are unset (None) are skipped
# instead of raising TypeError. This matters for keys that are not needed by
# the back-end currently in use.
_api_keys = {
    'PINECONE_API_KEY': PINECONE_API_KEY,
    'OPENAI_API_KEY': OPENAI_API_KEY,
    # LangChain's HuggingFaceHub reads the upper-case name; environment
    # variable names are case-sensitive outside Windows.
    'HUGGINGFACEHUB_API_TOKEN': HUGGINGFACE_API_KEY,
    # Original lower-case name, kept so anything relying on it still works.
    'huggingfacehub_api_token': HUGGINGFACE_API_KEY,
}
for _name, _value in _api_keys.items():
    if _value is not None:
        os.environ[_name] = _value

In [19]:
from langchain_pinecone import PineconeVectorStore

# Upload the chunks only when the index is still empty. Re-running an
# unconditional upload stores every chunk again under new ids; the saved
# retriever outputs in this notebook show the same passage returned twice
# with different ids. The upload also re-embeds every chunk each time.
# NOTE(review): index stats can lag briefly after writes, so an immediate
# re-run right after the first upload may still see a count of 0.
index_stats = pc.Index(index_name).describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents = text_chunks,
        index_name = index_name,
        embedding = embeddings
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name = index_name,
        embedding = embeddings
    )

In [20]:
# Build the vector store from the index named by index_name, using the same
# embedding model. This rebinds docsearch, replacing the object from the
# previous cell.
docsearch = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding = embeddings
)

In [21]:
# Similarity search configured with k = 3 results per query.
retriever = docsearch.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)

In [22]:
# Try the retriever on a sample question; the result is stored but not displayed here.
retrieved_doc = retriever.invoke('What is Acne?')

In [40]:
from langchain_openai import OpenAI
from transformers import pipeline
from langchain.llms import HuggingFaceHub

# llm = OpenAI(temperature = 0.4, max_tokens = 500)
# hf_pipeline = pipeline(
#     "text-generation", 
#     model="mistralai/Mistral-7B-Instruct-v0.1",
#     use_auth_token=True
# )
# Hosted Mistral-7B-Instruct model used as the generator.
# Generation settings: sampling enabled at temperature 0.4, at most 500 new
# tokens, and return_full_text=False so the prompt is not repeated in the output.
llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.1",
    model_kwargs=dict(
        temperature=0.4,
        max_new_tokens=500,
        do_sample=True,
        return_full_text=False,
    ),
    huggingfacehub_api_token=HUGGINGFACE_API_KEY,
)


In [106]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
import re

# System instructions for the answering model. The adjacent string literals
# are concatenated into one string; "{context}" is a template placeholder
# (presumably filled with the retrieved documents by the stuff-documents
# chain; confirm against the chain's documentation).
# NOTE(review): the three-sentence limit is stated twice (the last bullet and
# the line after it); consider keeping only one.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "You must use ONLY the provided retrieved context to answer the question.  "
    "- If the answer is NOT in the context, respond with: **'I don’t know.'** "
    "- Do NOT generate any additional information.  "
    "- If the answer is in the context, answer concisely with no more than 3 sentences.  "
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "Context: {context}"
)

# system_prompt = """You are an assistant for question-answering tasks.  
# You must use ONLY the provided retrieved context to answer the question.  

# - If the answer is in the context, answer concisely.  
# - If the answer is NOT in the context, respond with: **"I don’t know."**  
# - Do NOT generate any additional information.  

# Context:  
# {context}  

# Question: {input}  
# Answer:"""




# system_prompt = (
#     "You are an assistant for question-answering tasks. "
#     "Use the following pieces of retrieved context to answer the question. "
#     "If the answer is not in the retrieved context, say: 'I don't know.' "
#     "Use three sentences maximum and keep the answer concise."
#     "\n\n"
#     "{context}\n\n"
#     "User: {input}\n"
#     "Assistant:"
# )

# prompt = ChatPromptTemplate.from_messages(
#     [
#         ("system", system_prompt),
#         ("human", "{input}")
#     ]
# )

# system_prompt = """You are an assistant for question-answering tasks. 
# Use only the following retrieved context to answer the user's question. 
# If the answer is not in the retrieved context, say: 'I don't know.' 
# Keep your answer concise and limited to three sentences. 

# Context: {context}"""

# Message sequence: system instructions, the user's question, then an empty
# assistant turn so the model knows to generate after it.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
        ("assistant", ""),
    ]
)


In [107]:
# Combine the LLM and the prompt into a document-answering chain, then place
# the retriever in front of it to form the full RAG chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [113]:
# Ask a question that the medical source should not cover, to check that the
# chain answers "I don't know" rather than inventing an answer.
response = rag_chain.invoke({"input": "Who is Harry Potter?"})



In [114]:
# Show only the generated answer text from the chain's result.
print(response['answer'])

 I don’t know.


In [110]:
# Display the chain's full result.
# NOTE(review): the saved output for this cell shows the input
# 'What is Covariance?', so it predates the latest invoke cell. Re-run the
# notebook top to bottom to refresh it.
response

{'input': 'What is Covariance?',
 'context': [Document(id='8d64c95d-96b2-4944-b56e-41ab17845215', metadata={'page': 451.0, 'page_label': '452', 'source': 'Data\\Medical_book.pdf'}, page_content='American Medical Association. 515 N. State St., Chicago, IL\n60612. (312) 464-5000. <http://www.ama-assn.org>.\nJoseph Knight, PA\nBalance and coordination tests\nDefinition\nBalance is the ability to maintain a position. Coordi-\nnation is the capacity to move through a complex set of\nmovements. Balance and coordination depend on the\ninteraction of multiple body organs and systems includ-\ning the eyes, ears, brain and nervous system, cardiovas-\ncular system, and muscles. Tests or examination of any'),
  Document(id='43aaf717-1029-453b-a283-366e476baa32', metadata={'page': 451.0, 'page_label': '452', 'source': 'Data\\Medical_book.pdf'}, page_content='American Medical Association. 515 N. State St., Chicago, IL\n60612. (312) 464-5000. <http://www.ama-assn.org>.\nJoseph Knight, PA\nBalance and

In [78]:
# Debug aid: inspect what the retriever returns for the off-topic question.
# NOTE(review): this cell's execution count is lower than the cells above it,
# so it was run out of order. The name retrieved_docs is also distinct from
# the earlier retrieved_doc.
retrieved_docs = retriever.invoke("Who is Harry Potter?")
print("Retrieved Docs:\n", retrieved_docs)

Retrieved Docs:
 [Document(id='16bba5e4-6042-43ec-9377-4028e4130a5d', metadata={'page': 111.0, 'page_label': '112', 'source': 'Data\\Medical_book.pdf'}, page_content='quite low in alcoholic patients, and deficiency of thiamine\nis responsible for the Wernicke-Korsakoff syndrome.\nAfter cessation of drinking has been accomplished,\nthe next steps involve helping the patient avoid ever tak-\ning another drink. This phase of treatment is referred to\nas rehabilitation . The best programs incorporate the\nfamily into the therapy, because the family has undoubt-\nedly been severely affected by the patient’s drinking.'), Document(id='f58170a9-5941-4684-9cf9-0c1ddc2c5dc4', metadata={'page': 111.0, 'page_label': '112', 'source': 'Data\\Medical_book.pdf'}, page_content='quite low in alcoholic patients, and deficiency of thiamine\nis responsible for the Wernicke-Korsakoff syndrome.\nAfter cessation of drinking has been accomplished,\nthe next steps involve helping the patient avoid ever tak-\nin