<a href="https://colab.research.google.com/github/sarunsmenon/llm/blob/main/rmit_crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Libraries

In [49]:
!pip install -q python-dotenv openai langchain-openai cohere langchain langchain_community pypdf faiss-gpu wikipedia-api faiss-cpu wikipedia langchainhub unstructured playwright uuid7

# Load Libraries

In [31]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [32]:
import os
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv, find_dotenv
from google.colab import userdata
import pickle

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

In [50]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader, WikipediaLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.tools.retriever import create_retriever_tool

from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.agents import create_tool_calling_agent
from langchain.agents import AgentExecutor

from langchain import hub
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

from uuid_extensions import uuid7str


# Load Variables

In [51]:
llm_model = 'gpt-3.5-turbo-0301'
llm = ChatOpenAI(model=llm_model, temperature=0)
os.environ['OPENAI_API_KEY'] = userdata.get('open_ai_key')
prompt_template = "hwchase17/openai-functions-agent"

os.environ['LANGCHAIN_TRACING_V2']="true"
os.environ['LANGCHAIN_API_KEY']=userdata.get('langsmith_api_key')

fldr = '/content/drive/MyDrive/Colab Notebooks/Langchain/rmit/'
session_id = uuid7str()

In [35]:
# Start crawling from the initial URL
start_url = 'https://www.rmit.edu.au/'
ignore_lst = ['rmit.edu.vn', 'rmit.edu.eu', 'rmit.eu']
max_pg_lmt = 500
db_name = "faiss_rmit_db"

# Load Functions

In [36]:
# Function to get all links from a page
def get_all_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = [a.get('href') for a in soup.find_all('a', href=True)]
    full_links = [urljoin(url, link) for link in links]
    return full_links

In [37]:
# Function to crawl the website
def crawl_website(start_url, max_pages=max_pg_lmt):
    itr = 0

    visited = set()
    to_visit = [start_url]

    while to_visit and len(visited) < max_pages:
      url = to_visit.pop(0)
      if any( [url for site in ignore_lst if site in url]) or (url in visited):
          continue
      visited.add(url)
      try:
        links = get_all_links(url)
        to_visit.extend(links)
      except:
        continue

      itr += 1
      if itr % 100 == 0:
        print(f"Visited {len(visited)}: {url}")

    return visited

In [38]:
def extract_process_url(url):
  loader = UnstructuredURLLoader(urls=[url])
  data = loader.load()

  text_splitter = CharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=5,
                separator= "\n\n",
                length_function=len,
                is_separator_regex=False
              )

  docs = text_splitter.split_documents(data)
  return docs


In [39]:
def store_doc_into_db(docs, faiss_rmit_db):
  if faiss_rmit_db is None:
    faiss_rmit_db = FAISS.from_documents(docs, OpenAIEmbeddings())
  else:
    faiss_rmit_db.add_documents(docs)

  return faiss_rmit_db


In [69]:
def generate_chat_response(message, local_session_id):
  result = agent_with_chat_history.invoke({"input": message}, config={"configurable": {"session_id": local_session_id}})
  return result['output']

# Create URL Lists

In [None]:
visited_urls = crawl_website(start_url)
print(f"Total visited URLs: {len(visited_urls)}")

Visited 101: https://www.rmit.edu.au/life-at-rmit/student-for-a-day
Visited 202: https://www.rmit.edu.au/students/my-course
Visited 302: https://www.rmit.edu.au/alumni/alumni-faqs
Visited 402: https://rmit.primo.exlibrisgroup.com/discovery/account?vid=61RMIT_INST:RMITU§ion=overview&lang=en
Total visited URLs: 500


In [None]:
with open(fldr+"url_list.pkl", "wb") as f:
    pickle.dump(visited_urls, f)

In [11]:
with open(fldr+"url_list.pkl", "rb") as f:
    visited_urls = list(pickle.load(f))

error_url_lst = []

In [12]:
print(f"Total visited URLs: {len(visited_urls)}")

Total visited URLs: 500


# Create Database

In [14]:
if os.path.exists(fldr+db_name):
  faiss_rmit_db = FAISS.load_local(fldr+db_name,embeddings=OpenAIEmbeddings(), allow_dangerous_deserialization=True)
  print("DB already exists")
else:
  faiss_rmit_db = None
  print("create a new database because none exists")


DB already exists


In [None]:
for url in visited_urls[0:20]:
  print(url)
  try:
    docs = extract_process_url(url)
    faiss_rmit_db = store_doc_into_db(docs, faiss_rmit_db)
  except:
    error_url_lst.append(url)


https://www.rmit.edu.au/news/all-news/2024/july/telstra-health-aged-care
https://www.rmit.edu.au/study-with-us/business/digital3
https://www.rmit.edu.au/library/study/copyright-advice
https://www.rmit.edu.au/library/about-and-contacts/computers-and-study-spaces
https://www.rmit.edu.au/library/research/strategic-publishing
https://rmit.primo.exlibrisgroup.com/permalink/61RMIT_INST/1b6r78u/alma9915302290001341
https://www.rmit.edu.au/library#tab3
https://online.rmit.edu.au/courses?utm_campaign=RMIT%20Central&utm_source=rmit&utm_medium=referral&utm_content=mega-nav
https://www.rmit.edu.au/library/borrowing-and-collections/collections
https://www.rmit.edu.au/about/our-locations-and-facilities/locations/overseas/indonesia
https://www.rmit.edu.au/library/about-and-contacts/makerspace
https://www.rmit.edu.au/library/research/advice-training-and-support
https://outlook.office.com/
https://www.rmit.edu.au/study-with-us/levels-of-study/research-programs
https://www.rmit.edu.au/staff
https://www.

# Write Everything

In [None]:
print(faiss_rmit_db)

None


In [None]:
FAISS.save_local(faiss_rmit_db, fldr+db_name)

In [None]:
with open(fldr+"error_urls.pkl", "wb") as f:
  pickle.dump(error_url_lst, f)

# Create Retriever

In [40]:
if os.path.exists(fldr+db_name):
  faiss_rmit_db = FAISS.load_local(fldr+db_name,embeddings=OpenAIEmbeddings(), allow_dangerous_deserialization=True)
else:
  faiss_rmit_db = None

In [41]:
primary_retriever = faiss_rmit_db.as_retriever( search_type="similarity_score_threshold",
                            search_kwargs={"score_threshold": 0.5,"k": 1}
)

In [None]:
primary_retriever.invoke("aged care prize")

[Document(metadata={'source': 'https://www.rmit.edu.au/news/all-news/2024/july/telstra-health-aged-care'}, page_content='The innovation reliably detects deterioration in frail aged care residents, marking the introduction of clinical decision support software to predict deterioration – already used in acute care settings – into aged care.\n\nThe tool has the capacity to automatically monitor both structured and free-text electronic patient records for 36 evidence-based indicators of deterioration, making it the most reliable tool of its kind developed to date.\n\nAs well as providing aged care staff with a frailty index for each resident, the system can also identify specific alerts for their risk of falls, depression and mortality.\n\nAlessandro Luongo (DHCRC), Hui Mathews (DHCRC), Dr Tabinda Sarwar (RMIT), Vickie Irving (Telstra Health), Dr Jocelyn Ling (DHCRC), Dr Clare Morgan (DHCRC), Judith Ngai (DHCRC) at the Cooperative Research Australia awards in Brisbane.')]

In [None]:
primary_retriever.invoke("how to get rmit scholarship")

[Document(metadata={'source': 'https://www.rmit.edu.au/study-with-us/applying-to-rmit/local-student-applications/scholarships'}, page_content='Workforce development\nCollaborate with RMIT\nResearch partnerships\nFacilities, equipment and services\nContact Industry Engagement\nGiving to RMIT\nInternational\nStudy in Australia\nApply to RMIT as an international student\nCourses for international students\nInternational student enquiries\nFees and scholarships for international students\nInternational student services\nKey dates for international students\nFrequently asked questions\nScholarships\nAchieve your study goals with a scholarship. Each year scholarships are awarded to thousands of new and continuing students across RMIT program areas.\nAchieve your study goals with a scholarship. Each year scholarships are awarded to thousands of new and continuing students across RMIT program areas.\nStudy with us / Applying to RMIT / Local student applications / Scholarships\nAchieve your stu

# Agents

In [42]:
rmit_tool = create_retriever_tool(
    primary_retriever,
    "rmit_search",
    "Search for information about RMIT"
)

In [43]:
tools = [rmit_tool ]

In [44]:
# Get the prompt to use - you can modify this!
prompt = hub.pull(prompt_template)
prompt.messages

[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpful assistant')),
 MessagesPlaceholder(variable_name='chat_history', optional=True),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')),
 MessagesPlaceholder(variable_name='agent_scratchpad')]

In [45]:
agent = create_tool_calling_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

# Chat History

In [25]:
agent_with_chat_history = RunnableWithMessageHistory(
    agent_executor,
    # This is needed because in most real world scenarios, a session id is needed
    # It isn't really used here because we are using a simple in memory ChatMessageHistory
    lambda session_id: ChatMessageHistory(),
    input_messages_key="input",
    history_messages_key="chat_history",
)

# Execution

In [68]:
input_qn  = "Hi"
while input_qn != "exit":
  print(f" Input qn is : {input_qn}")
  input_qn = input("Enter a message (to finish use exit): ")
  print(generate_chat_response(input_qn, session_id))

 Input qn is : Hi
Enter a message (to finish use exit): did rmit win any aged care prize


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `rmit_search` with `{'query': 'RMIT aged care prize'}`


[0m[36;1m[1;3mTrailblazing digital health collaboration to improve aged care wins national prize

SKIP TO CONTENT

Search

RMIT Australia

RMIT Europe

RMIT Global

RMIT Vietnam

Study online

Students

Alumni

Staff

Library

RMIT Online

Courses

Courses by study area

Undergraduate courses

Postgraduate courses

Vocational studies

Short courses

Pre-university studies

Online courses and degrees

Entry pathways

Courses for international students

Study with us

How to apply

Fees

Scholarships

School leaver information

Student services

Student experience

Key dates

Frequently asked questions

Parents

Career advisers

Life at RMIT

Study experience

Student life

Support for students

Global opportunities

Industry connections

About

News

Events

Maps

Caree

# Testing

In [None]:
from langchain_community.document_loaders import PlaywrightURLLoader

urls = [
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    "https://goo.gl/maps/NDSHwePEyaHMFGwh8",
]

loader = PlaywrightURLLoader(urls=visited_urls, remove_selectors=["header", "footer"])

data = await loader.aload()


ERROR:langchain_community.document_loaders.url_playwright:Error fetching or processing https://www.rmit.edu.au/staff, exception: Locator.all: Execution context was destroyed, most likely because of a navigation


CancelledError: 

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import requests

# Initialize embeddings and FAISS vector store
embeddings = OpenAIEmbeddings()
faiss_store = FAISS(embedding_function=embeddings)

# Function to fetch content from a URL and add it to the FAISS vector store
def fetch_and_add_to_faiss(url, faiss_store):
    try:
        response = requests.get(url)
        response.raise_for_status()
        text = response.text
        faiss_store.add_texts([text])
        print(f"Added content from {url} to FAISS store.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from {url}: {e}")



TypeError: FAISS.__init__() missing 3 required positional arguments: 'index', 'docstore', and 'index_to_docstore_id'

In [None]:
# Process each URL and add to the FAISS store
for url in visited_urls:
    fetch_and_add_to_faiss(url, faiss_store)

print(f"FAISS vector store now contains {faiss_store.index.ntotal} vectors.")

# Save the FAISS vector store
faiss_store.save("faiss_store.pkl")


In [None]:
with open(fldr+"visited_urls.pkl", "wb") as f:
    pickle.dump(visited_urls, f)

In [None]:

with open(fldr+"visited_urls.pkl", "rb") as f:
    visited_urls = pickle.load(f)