In [1]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")


In [2]:
from langchain_groq import ChatGroq
model= ChatGroq(
    model="llama3-70b-8192",
    api_key=GROQ_API_KEY,
    temperature=0.9,
)

In [5]:
%pip install pypdf
from langchain_community.document_loaders import PyPDFLoader

file_path = "./data/Be_Good.pdf"

loader = PyPDFLoader(file_path)

docs = loader.load()

print(len(docs))

Collecting pypdf
  Using cached pypdf-5.7.0-py3-none-any.whl.metadata (7.2 kB)
Using cached pypdf-5.7.0-py3-none-any.whl (305 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.7.0
Note: you may need to restart the kernel to use updated packages.
11


In [6]:
print(docs[0].page_content[0:100])
print(docs[0].metadata)

Be Good - Essay by Paul Graham
Be Good
Be good
April 2008(This essay is derived from a talk at the 2
{'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20240613143635', 'title': 'Be Good - Essay by Paul Graham', 'author': 'Paul Graham', 'source': './data/Be_Good.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1'}


RAG


In [11]:
%pip install langchain-chroma


Collecting langchain-chroma
  Using cached langchain_chroma-0.2.4-py3-none-any.whl.metadata (1.1 kB)
Collecting chromadb>=1.0.9 (from langchain-chroma)
  Using cached chromadb-1.0.15-cp39-abi3-win_amd64.whl.metadata (7.1 kB)
Collecting build>=1.0.3 (from chromadb>=1.0.9->langchain-chroma)
  Using cached build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting pybase64>=1.4.1 (from chromadb>=1.0.9->langchain-chroma)
  Using cached pybase64-1.4.1-cp311-cp311-win_amd64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb>=1.0.9->langchain-chroma)
  Using cached posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb>=1.0.9->langchain-chroma)
  Using cached onnxruntime-1.22.0-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb>=1.0.9->langchain-chroma)
  Using cached opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (fr

ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'd:\\anaconda\\envs\\llmapp\\Lib\\site-packages\\onnxruntime\\quantization\\calibrate.py'
Consider using the `--user` option or check the permissions.



In [13]:
%pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.7.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.0-cp311-cp311-win_amd64.whl.metadata (14 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.16.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting Pillow (from sentence-transformers)
  Using cached pillow-11.3.0-cp311-cp311-win_amd64.whl.metadata (9.2 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached regex-2024.11.6-cp311-cp311-win_amd64.whl.metadata (41 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached

In [15]:
from langchain_chroma import Chroma
#hugging face embeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [16]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

splits = text_splitter.split_documents(docs)

vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

retriever = vectorstore.as_retriever()

In [17]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(model, prompt)

rag_chain = create_retrieval_chain(retriever, question_answer_chain)

results = rag_chain.invoke({"input": "What is this article about?"})

results["answer"]

'This article, "Be Good" by Paul Graham, explores the idea that making something people want and not worrying too much about making money can lead to a surprising conclusion: that businesses and charities have similar goals. The author discusses how this idea can be applied to various contexts, including startups and addressing social issues like malaria.'

In [18]:
results

{'input': 'What is this article about?',
 'context': [Document(id='34d77361-4707-46aa-93e2-918c71b1169c', metadata={'creator': 'PyPDF', 'page_label': '1', 'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'author': 'Paul Graham', 'total_pages': 11, 'creationdate': 'D:20240613143635', 'page': 0, 'title': 'Be Good - Essay by Paul Graham', 'source': './data/Be_Good.pdf'}, page_content="Be Good - Essay by Paul Graham\nBe Good\nBe good\nApril 2008(This essay is derived from a talk at the 2008 Startup School.)About a month after we\nstarted Y Combinator we came up with the\nphrase that became our motto: Make something people want.  We've\nlearned a lot since then, but if I were choosing now that's still\nthe one I'd pick.Another thing we tell founders is not to worry too much about the\nbusiness model, at least at first.  Not because making money is\nunimportant, but because it's so much easier than building something\ngreat.A couple weeks ago I realized that if you put those two ide

In [19]:
results = rag_chain.invoke({"input": "What is the main message of the article?"})

results["answer"]

'The main message of the article "Be Good" by Paul Graham is that being good and doing what is best for users is a key to success for startups, and that focusing on making something people want rather than worrying about making money can lead to a successful business model.'