In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
from fpdf import FPDF

### STEP 1: Scrape data and save as a JSON file ###

# URL to scrape
url = 'https://en.wikipedia.org/wiki/97th_Academy_Awards'

response = requests.get(url)
page_content = response.content

# Create a BeautifulSoup object
soup = BeautifulSoup(page_content, 'html.parser')

# Extract all <p> tags and combine their text
paragraphs = soup.find_all('p')
text_content = ' '.join([para.get_text() for para in paragraphs])

print(text_content)  # This will print the text extracted from the page


 
 The 97th Academy Awards ceremony, presented by the Academy of Motion Picture Arts and Sciences (AMPAS), took place on March 2, 2025, at the Dolby Theatre in Hollywood, Los Angeles. During the gala, the AMPAS presented Academy Awards (commonly referred to as Oscars) in 23 categories, honoring films released in 2024. The ceremony was televised in the United States by ABC and streamed on Hulu for the first time.[4][5][6] Comedian Conan O'Brien hosted the show for the first time, with Raj Kapoor and Katy Mullan returning as executive producers.[7][8][9][10][11]
 Anora won a leading five awards, including Best Picture.[12] Other winners included The Brutalist with three awards; Dune: Part Two, Emilia Pérez, and Wicked with two awards each; and Conclave, Flow, I'm Not a Robot, I'm Still Here, In the Shadow of the Cypress, No Other Land, The Only Girl in the Orchestra, A Real Pain, and The Substance with one each.[12] The telecast drew 19.69 million viewers in the United States.[3]
 The n

In [2]:
#Import a text splitter class from LangChain.
#This class recursively splits documents by paragraph, sentence, or character while preserving meaning.
from langchain.text_splitter import RecursiveCharacterTextSplitter
#chunk_size=1000: Each chunk will be up to 1000 characters long.
#chunk_overlap=20: Each chunk will overlap 20 characters with the next, helping maintain context between chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
chunks = text_splitter.split_text(text_content)

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.cache import CacheBackedEmbeddings
from langchain.storage import LocalFileStore

# Set up the cache store
store = LocalFileStore("./cache/")

# Initialize the Hugging Face embedding model
core_embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Use the cache-backed embedder with the Hugging Face model
embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model,
    store,
    namespace=core_embeddings_model.model_name
)

  core_embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Store embeddings in the Pinecone vector store
from langchain_pinecone import PineconeVectorStore
import os
from pinecone import  Pinecone, ServerlessSpec
from dotenv import load_dotenv
load_dotenv()    
pinecone_api_key = os.getenv("PINECONE_API_KEY")
index_name = "ragchatbot2" 


# Initialize Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Huggingface embeddings = 384
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

# Create vectorstore
for i in range(0, len(chunks), 100):  # upload 100 at a time
    vectorstore= PineconeVectorStore.from_texts(
        chunks[i:i+100],
        embedding=embedder,
        index_name=index_name
    )

# Query
query = "What is Pinecone used for?"
results = vectorstore.similarity_search(query)

# Print top result
print("Answer:", results[0].page_content)

# Instantiate a retriever from the vector store
retriever = vectorstore.as_retriever()

Answer: In introducing the "In Memoriam" montage, Morgan Freeman also gave an individual spoken tribute to Gene Hackman.[75] The montage, which featured the Los Angeles Master Chorale performing "Lacrimosa" from Mozart's Requiem, paid tribute to the following individuals:[76]


In [9]:
# from langchain_postgres import PGVector

# # # See docker command above to launch a postgres instance with pgvector enabled.
# connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"  # Uses psycopg3!
# collection_name = "oscar"

# vector_store = PGVector(
#    embeddings=embedder,
#    collection_name=collection_name,
#   connection=connection,
#   use_jsonb=True,
#  )


# # Query
# # query = "who won oscar?"
# # results = vector_store.similarity_search(query)

# # # Print top result
# # print("Answer:", results[0].page_content)
# results = vector_store.similarity_search("What is pgvector?")
# for r in results:
#     print(r.page_content)
# # Instantiate a retriever from the vector store
# retriever = vector_store.as_retriever()
# retriever = vector_store.as_retriever()
# results = retriever.get_relevant_documents("oscars")
# print("Results:", len(results))
# for res in results:
#     print(res.page_content)


In [16]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
import os
# this formats the docs returned by the retriever
def format_docs(docs):
	return "\n\n".join(doc.page_content for doc in docs)

# prompt to send to the LLM
prompt = """You are an assistant for question-answering tasks.
    	Use the following pieces of retrieved context to answer the question.
    	If you don't know the answer, search in google  .

    	Question: {question}

    	Context: {context}

    	Answer:
    	"""

prompt_template = ChatPromptTemplate.from_template(prompt)

llm = ChatGroq(
    model_name="llama3-70b-8192", streaming=True, groq_api_key=os.getenv("GROQ_API_KEY")
)
"""This code defines a chain where input documents are first formatted,
then passed through a prompt template,
and finally processed by an LLM."""

rag_chain_from_docs = (
	RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
		| prompt_template
	| llm
	)
"""This code creates a parallel process:
one retrieves the context (using a retriever),
and the other passes the question through unchanged.
The results are then combined and assigned to the variable `answer` using the `rag_chain_from_docs` processing chain."""

rag_chain_with_source = RunnableParallel(
	{"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [17]:
#Retrieves relevant documents (like the first example).
#Then sends those documents as context to an LLM (Groq/LLaMA-3).
#The LLM reads the context and generates a natural language answer.
response = rag_chain_with_source.invoke("when did the 97th Academy Awards ceremony took place?")
print(response['answer'].content)
print(response['context'][2].page_content)

The 97th Academy Awards ceremony took place on March 2, 2025.
The nominees for the 97th Academy Awards were announced on January 23, 2025, at the Samuel Goldwyn Theater in Beverly Hills, by actress Rachel Sennott and actor Bowen Yang.[14] Emilia Pérez led all nominees with thirteen nominations, the most for a non-English-language film in Oscars history, while The Brutalist and Wicked tied for second with ten nominations each.[15] The winners were announced during the awards ceremony on March 2, 2025.[12]


In [18]:
#similarity_search
#Only retrieves relevant documents from Pinecone (your vector store) using vector similarity.
#It does not pass the results to an LLM.
#It gives you raw text chunks — you handle the response manually if needed.
query = "when did the 97th Academy Awards ceremony took place?"
docs = vectorstore.similarity_search(query)

#print results

print(docs[0].page_content)

The 97th Academy Awards ceremony, presented by the Academy of Motion Picture Arts and Sciences (AMPAS), took place on March 2, 2025, at the Dolby Theatre in Hollywood, Los Angeles. During the gala, the AMPAS presented Academy Awards (commonly referred to as Oscars) in 23 categories, honoring films released in 2024. The ceremony was televised in the United States by ABC and streamed on Hulu for the first time.[4][5][6] Comedian Conan O'Brien hosted the show for the first time, with Raj Kapoor
