In [2]:
from langchain.document_loaders import  PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
def read_pdf_file(data):
    """Load the PDF files of a directory as documents.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; entries
            matching the ``*.pdf`` glob are read with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [6]:
# Load the PDFs from the data directory that sits next to this notebook's folder.
extracted_data = read_pdf_file(data="../pdf_data")

In [7]:
def split_text(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping chunks for embedding.

    Args:
        extracted_data: Documents to split, e.g. the result of
            ``read_pdf_file``.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.
            Defaults to 20, the value that used to be hard-coded here.

    Returns:
        The chunks produced by ``split_documents`` on ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [8]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = split_text(extracted_data)
print(f"Text Chunks Length: {len(text_chunks)}")

Text Chunks Length: 85


In [10]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
def download_hugging_face_embeddings():
    """Create the embedding model used throughout this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_id)

In [12]:
# Instantiate the embedding model once; later cells reuse `embeddings`
# for both the dimension check and the Pinecone vector store.
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [14]:
# Sanity check: embed a sample string and inspect the vector length,
# which is the dimension the vector index has to be created with.
query_result = embeddings.embed_query(text="saurav")
len(query_result)

384

In [16]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os
from dotenv import load_dotenv

load_dotenv()

# Fail fast with a clear message instead of handing ``None`` to the client.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your environment or .env file."
    )

pc = Pinecone(api_key=pinecone_api_key)
index_name = "pdf-qa"

# Must equal the length of the vectors produced by the embedding model
# (the embed_query check earlier in this notebook reports 384).
EMBEDDING_DIMENSION = 384

# Create the index only when it is missing, so this cell can be re-run
# (e.g. after Restart & Run All) without failing on an existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [17]:
from langchain_pinecone import PineconeVectorStore

# Embed every chunk and store the vectors in the Pinecone index.
# NOTE(review): re-running this cell presumably uploads the same chunks
# again — confirm before executing it more than once.
docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embeddings,
    index_name=index_name,
)

In [18]:
# Attach to the already-populated index (no documents are passed here),
# rebinding `docsearch` to a store backed by the existing index.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [20]:
# Similarity-search retriever; k=3 is passed through as a search keyword argument.
retriver = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)

In [None]:
# Smoke-test the retriever with a sample question.
sample_question = "What is Adversarial Machine Learning?"
retrived_docs = retriver.invoke(sample_question)

In [22]:
# Bare expression so the notebook renders the retrieved documents as the cell output.
retrived_docs

[Document(id='435ffc3a-2fde-43d2-82e9-71669160c549', metadata={'page': 0.0, 'source': '..\\pdf_data\\aml_industry_perspective.pdf'}, page_content='Adversarial Machine Learning - Industry\nPerspectives\nRam Shankar Siva Kumar, Magnus Nystr ¨om,John Lambert, Andrew Marshall, Mario Goertzel,\nAndi Comissoneru, Matt Swann and Sharon Xia\nMicrosoft\nRedmond,USA\nEmail:atml@microsoft.com\nAbstract—Based on interviews with 28 organizations, we found\nthat industry practitioners are not equipped with tactical and\nstrategic tools to protect, detect and respond to attacks on their\nMachine Learning (ML) systems. We leverage the insights from'),
 Document(id='00be6047-d243-45c1-a9be-02f8bc258f69', metadata={'page': 0.0, 'source': '..\\pdf_data\\aml_industry_perspective.pdf'}, page_content='engineering\nI. I NTRODUCTION\nAdversarial Machine Learning is now having a moment in\nthe software industry - For instance, Google [1], Microsoft [2]\nand IBM [3] have signaled, separate from their commitment

[REDACTED: an API key was printed in this cell output — it has been removed here; revoke and rotate the key, and clear outputs before sharing the notebook]


In [37]:
# from langchain_openai import OpenAI
from langchain_groq import ChatGroq
from dotenv import load_dotenv

load_dotenv()

# Read the key once and fail fast with a clear message when it is missing.
# Never print secret values in a notebook: cell outputs are saved with the file.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to your environment or .env file."
    )

# Chat model used by the question-answering chain.
llm = ChatGroq(groq_api_key=groq_api_key, model="gemma2-9b-it")

In [38]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the QA chain. Adjacent string literals are joined with
# no separator, so each fragment that continues a sentence must end with an
# explicit space. `{context}` is the placeholder filled with retrieved text.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions (which carry the
# `{context}` placeholder) followed by the user's question as `{input}`.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [39]:
# First build the chain that answers from supplied documents, then wrap it
# with the retriever so documents are fetched for each incoming question.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriver,
    combine_docs_chain=question_answer_chain,
)

In [40]:
# Run the full chain on a question and display only the generated answer.
response = rag_chain.invoke(dict(input="what is adversarial machine learning?"))
response["answer"]

'Adversarial machine learning is a field focused on protecting machine learning (ML) systems from malicious attacks.  These attacks aim to manipulate ML models by introducing subtle changes to their input data, causing them to produce incorrect or unexpected outputs.  Industry is recognizing the need to secure ML systems against these threats. \n\n\n'

In [41]:
# Probe the chain with a question unrelated to the sample query above
# (per the recorded output, the model declines to answer).
response = rag_chain.invoke(dict(input="what is saurav?"))
response["answer"]

'The provided text does not mention who Saurav is.  Therefore, I cannot answer your question.  Please provide more context or information about Saurav. \n'