In [None]:
# April 2025
# RAG with Langchain
# Loading website contents and answering questions about them
# 

In [None]:
import os

from dotenv import load_dotenv
from langchain_groq import ChatGroq

# Credentials for Groq.
# Preferred: keep GROQ_API_KEY in a local .env file; load_dotenv() copies the
# entries of that file into the process environment.
load_dotenv()
api_key = os.getenv("GROQ_API_KEY")

# Alternative for quick local experiments only (never commit a real key):
# api_key = "<put the api key here>"

# Fail fast with an actionable message instead of letting the client fail
# later with a less obvious authentication error.
if not api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to a .env file or export it in the "
        "environment before running this notebook."
    )

# Pass the key explicitly so the value resolved above is the one that is used,
# whichever of the two methods provided it.
llm = ChatGroq(model="llama3-8b-8192", groq_api_key=api_key)

In [None]:
# # Optional: uncomment this cell to crawl the website's internal links down to a given depth. Crawling is slow, so it is disabled by default.
# import requests
# from bs4 import BeautifulSoup
# from urllib.parse import urljoin, urlparse
# import time

# def crawl_links(base_url, max_depth=2):
#     visited = set()
#     to_visit = [(base_url, 0)]
#     internal_links = []

#     while to_visit:
#         current_url, depth = to_visit.pop(0)

#         if current_url in visited or depth > max_depth:
#             continue

#         try:
#             response = requests.get(current_url, timeout=10)
#             visited.add(current_url)
#         except Exception as e:
#             print(f"Failed to fetch {current_url}: {e}")
#             continue

#         if "text/html" not in response.headers.get("Content-Type", ""):
#             continue

#         soup = BeautifulSoup(response.text, "html.parser")

#         for link in soup.find_all("a", href=True):
#             href = link["href"]
#             full_url = urljoin(current_url, href)

#             # Normalize and filter to domain
#             if full_url.startswith(base_url):
#                 parsed_url = urlparse(full_url)._replace(fragment="").geturl()

#                 if parsed_url not in visited and parsed_url not in [u for u, _ in to_visit]:
#                     to_visit.append((parsed_url, depth + 1))
#                     internal_links.append(parsed_url)
#                     # print(parsed_url)

#         # time.sleep(1)  # Be kind to the server!

#     return list(set(internal_links))


# # Example usage:
# main_url = "https://www.dunedinnz.com"
# otago_links = crawl_links(main_url, max_depth=1)

# print(f"Found {len(otago_links)} internal links:")
# for link in otago_links:
#     print(link)

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
# from langchain_community.chat_models import ChatGroq
import os


In [None]:
from langchain.document_loaders import WebBaseLoader

# Pages whose contents will be indexed for question answering.
urls = [
    "https://www.npr.org/2025/04/13/nx-s1-5363702/bernie-sanders-warns-of-extraordinary-danger-facing-u-s-under-trump-at-la-rally",
    "https://www.otago.ac.nz/future-students/why-otago",
]

# To index crawled pages instead, enable the crawler cell above and use:
# urls = otago_links[:10]

# Fetch every URL and keep the loaded documents for the later cells.
loader = WebBaseLoader(web_path=urls)
documents = loader.load()


In [None]:
# documents

In [None]:
# !pip -q install selenium

In [None]:
from langchain.document_loaders import SeleniumURLLoader

# Load the same `urls` again, this time through SeleniumURLLoader.
# NOTE: this rebinds both `loader` and `documents`, so the results of the
# WebBaseLoader cell are replaced when the notebook is run top to bottom.
loader = SeleniumURLLoader(urls=urls)
documents = loader.load()

In [None]:
# documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
docs = text_splitter.split_documents(documents=documents)


In [None]:
from langchain.vectorstores import FAISS
# from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model used for the chunks in `docs`.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Build a FAISS store from the chunked documents.
vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)


In [None]:
from langchain.vectorstores import Chroma

# Build a Chroma store from the same chunks and embeddings.
# This rebinds `vectorstore`, replacing the FAISS store from the previous cell.
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)


In [None]:
# Show the current store object; when the cells are run in order this is the
# Chroma instance, because that assignment came last.
vectorstore

In [None]:
# Retriever with default settings over the current vector store.
retriever = vectorstore.as_retriever()

In [None]:
# Quick retrieval check: show the chunks the retriever returns for a sample
# question. `invoke` replaces the deprecated `get_relevant_documents`.
retriever.invoke("how many courses are offered by Otago?")

In [None]:
from langchain.chains import RetrievalQA
# from langchain.chat_models import ChatOpenAI

# Baseline question-answering chain: retrieved chunks are handed to the LLM
# using the library's default prompt ("stuff" is the default chain type).
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)


In [None]:
# Ask the baseline chain a question about the indexed pages.
query = "How many students are studying at university of Otago?"
# query = "places to visit in Dunedin?"

# `Chain.run` is deprecated; `invoke` takes the chain's input key ("query")
# and returns a dict whose "result" entry holds the answer text.
answer = qa_chain.invoke({"query": query})["result"]
print(answer)


In [None]:
# How can we make the chain stricter, so that it does not make things up (hallucinate)?
# Answer: supply a custom prompt through the chain_type_kwargs argument.

In [None]:
from langchain.prompts import PromptTemplate

# Prompt text that tells the model to answer only from the retrieved context.
# Written as explicit pieces so the whitespace inside the template is visible.
promm = (
    "Given the following question and context, find the response. If the answer is not in the context\n"
    "    don't make things up and say I don't know. \n"
    "    \n"
    "    CONTEXT : {context}\n"
    "    QUESTION :{question}\n"
    "    "
)

# Template with the two placeholders the chain fills in.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=promm,
)

# Rebuild the QA chain so that it uses the stricter prompt defined above.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": PROMPT},
)

In [None]:
# Ask the current `qa_chain` a question.
query = "How many teachers are teaching at university of Otago?"
# query = "places to visit in Dunedin?"

# `Chain.run` is deprecated; `invoke` takes the chain's input key ("query")
# and returns a dict whose "result" entry holds the answer text.
answer = qa_chain.invoke({"query": query})["result"]
print(answer)
