<a href="https://colab.research.google.com/github/sarunsmenon/llm/blob/main/telstra_support_crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Libraries

In [1]:
!pip install -q python-dotenv openai langchain-openai cohere langchain langchain_community pypdf faiss-gpu wikipedia-api faiss-cpu wikipedia langchainhub unstructured playwright uuid7

# Load Libraries

In [2]:
import textwrap
import traceback
from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [3]:
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List
from langchain_core.messages import BaseMessage, AIMessage

In [4]:
import os
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv, find_dotenv
from google.colab import userdata
import pickle

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

In [5]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader, WikipediaLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.tools.retriever import create_retriever_tool

from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.agents import create_tool_calling_agent
from langchain.agents import AgentExecutor

from langchain import hub
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory

from uuid_extensions import uuid7str
from langchain_core.pydantic_v1 import BaseModel, Field


# Load Variables

In [6]:
# Secrets are read from Colab's `userdata` store and exported as environment
# variables rather than being written into the notebook.
os.environ['OPENAI_API_KEY'] = userdata.get('open_ai_key')
# LangChain Hub id of the agent prompt fetched later with `hub.pull`.
prompt_template = "hwchase17/openai-functions-agent"

# LangSmith tracing switch and its API key.
os.environ['LANGCHAIN_TRACING_V2']="true"
os.environ['LANGCHAIN_API_KEY']=userdata.get('langsmith_api_key')

# New identifier string generated on every run.
# NOTE(review): this module-level value is not read anywhere in the visible
# notebook (chat history is keyed by the entered username) - confirm it is needed.
session_id = uuid7str()

In [7]:
# Chat model handed to the agent below; temperature=0 asks for the least
# random sampling the model offers.
llm_model = 'gpt-3.5-turbo-1106'
llm = ChatOpenAI(model=llm_model, temperature=0)

In [8]:
# Crawl and storage configuration (nothing is crawled in this cell).
# Seed URL handed to crawl_website().
start_url = 'https://www.telstra.com.au/support'
# NOTE(review): ignore_lst and include_lst are not read anywhere in the visible
# notebook; crawl_website() hard-codes its own URL filters - confirm or remove.
ignore_lst = []
include_lst = ['support' ,'telstra']
# Upper bound on crawled pages; becomes the default `max_pages` of crawl_website().
max_pg_lmt = 5000
# Sub-folder name of the saved FAISS index inside `fldr`.
db_name = "faiss_telstra_support_db"
# Working folder for every persisted artefact (history, URL lists, index).
fldr = '/content/drive/MyDrive/Colab Notebooks/Langchain/telstra_support/'

In [9]:
# Chat histories keyed by session id; entries are created on demand by
# get_by_session_id() and the whole mapping is pickled by store_history().
hist_store= {}

In [10]:
if os.path.exists(fldr):
    # Nothing to create; just report where the working folder lives.
    print(f'Folder already exists at: {fldr}')
else:
    # First run: build the working folder, including any missing parents.
    os.makedirs(fldr)
    print(f'Folder created at: {fldr}')

Folder already exists at: /content/drive/MyDrive/Colab Notebooks/Langchain/telstra_support/


# Load Functions

In [12]:
def load_history(path=None):
  """Read the pickled chat-history store from disk and return it.

  The previous version bound the loaded object to a local variable and
  returned None, so `hist_store = load_history()` wiped the store.

  Args:
    path: Pickle file to read. Defaults to ``fldr + "history.pkl"``,
      resolved at call time.

  Returns:
    The object stored in the pickle file.

  Raises:
    OSError: If the file cannot be opened.
  """
  if path is None:
    path = fldr + "history.pkl"
  # SECURITY: pickle.load can execute arbitrary code from the file; only load
  # history files that this notebook wrote itself.
  with open(path, "rb") as f:
    return pickle.load(f)

In [13]:
def store_history(path=None, store=None):
  """Pickle the chat-history store to disk.

  Args:
    path: Destination file. Defaults to ``fldr + "history.pkl"``, resolved
      at call time.
    store: Object to persist. Defaults to the notebook-global `hist_store`.

  Raises:
    OSError: If the destination cannot be opened for writing.
  """
  if path is None:
    path = fldr + "history.pkl"
  if store is None:
    store = hist_store
  with open(path, "wb") as f:
    pickle.dump(store, f)

In [14]:
# Function to get all links from a page
def get_all_links(url, timeout=10):
    """Download `url` and return every hyperlink on it as an absolute URL.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the crawl forever.

    Returns:
        A list with one absolute URL per ``<a href=...>`` element, in document
        order. Relative hrefs are resolved against `url`.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    return [urljoin(url, a.get('href')) for a in soup.find_all('a', href=True)]

In [15]:
# Show the page limit before crawl_website() is defined with it as a default.
print(max_pg_lmt)

5000


In [16]:
# Function to crawl the website
def crawl_website(start_url, max_pages=max_pg_lmt):
    """Crawl Telstra support pages in FIFO order, starting from `start_url`.

    A URL is visited only if it contains "telstra.com.au" and "support" and
    does not contain "mobilesupport.telstra.com.au".

    Args:
        start_url: First URL placed on the queue.
        max_pages: Stop once this many URLs have been visited. The default is
            the value `max_pg_lmt` had when this function was defined.

    Returns:
        The set of visited URLs. A URL whose download failed is still part of
        the set; only its outgoing links are missing.
    """
    visited = set()
    # deque gives O(1) pops from the front; list.pop(0) shifts the whole queue.
    to_visit = deque([start_url])
    fetched = 0

    while to_visit and len(visited) < max_pages:
        url = to_visit.popleft()

        allowed = (
            "telstra.com.au" in url
            and "support" in url
            and "mobilesupport.telstra.com.au" not in url
        )
        if url in visited or not allowed:
            continue

        visited.add(url)
        try:
            links = get_all_links(url)
        except Exception:
            # Best effort: skip pages that cannot be fetched or parsed.
            # KeyboardInterrupt is deliberately not caught so the crawl can be
            # stopped from the notebook.
            continue
        to_visit.extend(links)

        fetched += 1
        if fetched % 10 == 0:
            print(f"Visited {len(visited)}: {url}")

    return visited

In [17]:
def extract_process_url(url):
  """Load a single web page and return it as a list of text chunks.

  The page is loaded with UnstructuredURLLoader and split on blank lines
  ("\n\n") into chunks of up to 1000 characters (measured with `len`) with
  an overlap of 5.

  Args:
    url: Address of the page to load.

  Returns:
    The documents produced by the splitter.
  """
  page_docs = UnstructuredURLLoader(urls=[url]).load()

  splitter_options = dict(
      separator="\n\n",
      chunk_size=1000,
      chunk_overlap=5,
      length_function=len,
      is_separator_regex=False,
  )
  return CharacterTextSplitter(**splitter_options).split_documents(page_docs)


In [18]:
def store_doc_into_db(docs, faiss_rmit_db):
  """Add `docs` to a FAISS store, creating the store on first use.

  Args:
    docs: Documents to embed and index.
    faiss_rmit_db: Existing store, or None when no store has been built yet.

  Returns:
    The store that now holds `docs`. When `docs` is empty the function
    returns `faiss_rmit_db` unchanged (possibly None).
  """
  if not docs:
    # Nothing to embed: skip the embeddings call, and do not try to build an
    # index from zero vectors.
    return faiss_rmit_db

  if faiss_rmit_db is None:
    return FAISS.from_documents(docs, OpenAIEmbeddings())

  faiss_rmit_db.add_documents(docs)
  return faiss_rmit_db


In [19]:
def generate_chat_response(message, local_session_id):
  """Send one user message through the history-aware agent.

  Args:
    message: Text supplied as the agent's "input".
    local_session_id: Value passed as the configurable "session_id".

  Returns:
    The "output" entry of the agent's result.
  """
  run_config = {"configurable": {"session_id": local_session_id}}
  payload = {"input": message}
  reply = agent_with_chat_history.invoke(payload, config=run_config)
  return reply['output']

In [20]:
def get_by_session_id(session_id: str) -> BaseChatMessageHistory:
    """Return the history stored under `session_id`, creating it if absent.

    A newly created InMemoryHistory is registered in `hist_store` before it is
    returned, so later calls with the same id get the same object.
    """
    if session_id in hist_store:
        return hist_store[session_id]

    new_history = InMemoryHistory()
    hist_store[session_id] = new_history
    return new_history


In [21]:
def get_user_and_retrieve_history(session_id=None):
  """Return the chat history for a session.

  This function is handed to RunnableWithMessageHistory as its history
  factory, which supplies the configured session id. Accepting that id
  removes the hidden dependency on the notebook-global `user_name`.

  Args:
    session_id: Key of the history to fetch. When omitted, the global
      `user_name` is used, which keeps zero-argument calls working.

  Returns:
    The history object from get_by_session_id().
  """
  if session_id is None:
    session_id = user_name
  return get_by_session_id(session_id)

def get_user():
  """Prompt on stdin for a username and return exactly what was typed."""
  return input("Enter your username : ")

In [22]:
class InMemoryHistory(BaseChatMessageHistory, BaseModel):
    """In memory implementation of chat message history.

    Instances are the values kept in `hist_store` (see get_by_session_id).
    """

    # Messages of this conversation; every instance starts with its own empty list.
    messages: List[BaseMessage] = Field(default_factory=list)

    def add_messages(self, messages: List[BaseMessage]) -> None:
        """Add a list of messages to the store"""
        self.messages.extend(messages)

    def clear(self) -> None:
        """Drop all stored messages by rebinding `messages` to a new empty list."""
        self.messages = []

# Load History

In [23]:
if os.path.exists(fldr+"history.pkl"):
  # Keep the current in-memory store when the loader returns nothing, so
  # hist_store is never replaced by None.
  hist_store = load_history() or hist_store
  print(f'History file found at: {fldr}history.pkl')

# Create URL Lists

In [None]:
# Long-running: crawls up to the default page limit of crawl_website().
# The result is a set of URLs.
visited_urls = crawl_website(start_url)
print(f"Total visited URLs: {len(visited_urls)}")

Visited 16: https://www.telstra.com.au/small-business/online-support/business-software
Visited 26: https://www.telstra.com.au/support/account-payment/remove-service-restriction
Visited 37: https://www.telstra.com.au/support/account-payment/make-a-complaint
Visited 51: https://www.telstra.com.au/support/internet-and-home-phone/home-internet-order-next-steps


In [None]:
# Persist the crawl result so the crawl does not have to be repeated.
with open(fldr+"url_list.pkl", "wb") as f:
    pickle.dump(visited_urls, f)

In [None]:
# Reload the saved crawl result. crawl_website() returns a set, so convert it
# to a list here because the indexing loop slices it.
# SECURITY: pickle.load can run arbitrary code; only load files written by
# this notebook.
with open(fldr+"url_list.pkl", "rb") as f:
    visited_urls = list(pickle.load(f))

# URLs that fail during extraction/indexing are collected here.
error_url_lst = []

In [None]:
# Confirm how many URLs were loaded back from disk.
print(f"Total visited URLs: {len(visited_urls)}")

# Create Database

In [None]:
# Reuse a previously saved index if there is one, so new pages are appended to
# it; otherwise start with None and let store_doc_into_db() create the index.
# allow_dangerous_deserialization=True opts in to loading the saved index;
# only do this for index folders this notebook saved itself.
if os.path.exists(fldr+db_name):
  faiss_telstra_support_db = FAISS.load_local(fldr+db_name,embeddings=OpenAIEmbeddings(), allow_dangerous_deserialization=True)
  print("DB already exists")
else:
  faiss_telstra_support_db = None
  print("create a new database because none exists")


In [None]:
# Embed and index every crawled page, capped at the configured page limit
# (max_pg_lmt) instead of a second hard-coded 5000.
for url in visited_urls[0:max_pg_lmt]:
  print(url)
  try:
    docs = extract_process_url(url)
    faiss_telstra_support_db = store_doc_into_db(docs, faiss_telstra_support_db)
  except Exception as exc:
    # Best effort: record the failing URL and why it failed, then carry on.
    # KeyboardInterrupt is not caught, so the loop can still be stopped.
    print(f"  failed: {exc!r}")
    error_url_lst.append(url)


# Save the Database and Failed URLs

In [None]:
# Quick check that a store object exists (prints None if nothing was indexed).
print(faiss_telstra_support_db)

<langchain_community.vectorstores.faiss.FAISS object at 0x7ca7940fdf00>


In [None]:
# Write the index to `fldr + db_name`. Calling the method on the instance
# fails immediately if no store was built, before anything is created on disk.
faiss_telstra_support_db.save_local(fldr+db_name)

In [None]:
# Keep the list of URLs that could not be indexed for later inspection.
with open(fldr+"error_urls.pkl", "wb") as f:
  pickle.dump(error_url_lst, f)

# Create Retriever

In [24]:
# Load the saved index for querying.
# allow_dangerous_deserialization=True opts in to loading the saved index;
# only do this for index folders this notebook saved itself.
# NOTE(review): when the folder is missing this rebinds the variable to None
# (discarding any store built earlier in the session), and the following
# `.as_retriever(...)` call will then fail - confirm this is intended.
if os.path.exists(fldr+db_name):
  faiss_telstra_support_db = FAISS.load_local(fldr+db_name,embeddings=OpenAIEmbeddings(), allow_dangerous_deserialization=True)
else:
  faiss_telstra_support_db = None

In [25]:
# Retriever over the support index: requests k=1 result per query and applies
# a similarity score threshold of 0.5.
primary_retriever = faiss_telstra_support_db.as_retriever( search_type="similarity_score_threshold",
                            search_kwargs={"score_threshold": 0.5,"k": 1}
)

In [26]:
# Smoke test: run one sample query through the retriever.
primary_retriever.invoke("modem not working")

[Document(metadata={'source': 'https://www.telstra.com.au/support/internet-and-home-phone/check-smart-modem#main-content'}, page_content='When the modem boots up (usually 2-3 minutes) the mobile signal light on the top of the modem should light up which indicates the modem has successfully contacted the mobile network.\n\nStep 7: Not working? Contact us\n\nIf the mobile signal light does not light up or is red, contact us or message us through the My Telstra app.\n\nFrequently asked questions\n\nIf you have followed the step by step instructions above, and your mobile signal light is still off, please message us through the My Telstra app. One of our agents will help you.\n\nThere are currently three different generations of the Telstra Smart Modem. If you have any of these models (look on the back or base of your modem to find the model number), then you have a Telstra Smart Modem.\n\nThe Telstra Smart Modem 2.0 has ports arranged vertically down the back\n\nThe Telstra Smart Modem 1.

# Agents

In [27]:
# Wrap the retriever as an agent tool: (retriever, tool name, tool description).
# NOTE(review): the variable name is missing an "r" ("telsta"); it is referenced
# by the `tools` list, so rename both together if it is corrected.
telsta_support_tool = create_retriever_tool(
    primary_retriever,
    "telstra_support_search",
    "Search for support from Telstra"
)

In [28]:
# Tools made available to the agent (currently only the support search).
tools = [telsta_support_tool ]

In [29]:
# Get the prompt to use - you can modify this!
prompt = hub.pull(prompt_template)
prompt.messages

[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpful assistant')),
 MessagesPlaceholder(variable_name='chat_history', optional=True),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')),
 MessagesPlaceholder(variable_name='agent_scratchpad')]

In [30]:
# Build the tool-calling agent from the model, tools and prompt, then wrap it
# in an executor; verbose=False keeps intermediate steps out of the output.
agent = create_tool_calling_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)

# Chat History

In [31]:
# Add per-session chat memory to the executor.
# get_user_and_retrieve_history is the history factory; the user's text goes in
# under "input" and prior messages are injected under "chat_history", matching
# the `chat_history` placeholder of the pulled prompt.
agent_with_chat_history = RunnableWithMessageHistory(
    agent_executor,
    get_user_and_retrieve_history,
    input_messages_key="input",
    history_messages_key="chat_history",
)

# Execution

In [None]:

# The username doubles as the session id under which chat history is stored.
user_name = get_user()
print(f"welcome :: {user_name}")


try:
  # Open with a canned greeting, then keep chatting until the user types "exit".
  input_qn  = "Hi Bot, I need help with Telstra Stuff"
  while input_qn != "exit":

    print("start chat")
    response = generate_chat_response(input_qn, user_name)
    print(textwrap.fill(response, 80))
    print("")
    input_qn = input("Enter a message (to finish use exit): ")
except Exception:
  # Report the failure instead of hiding it. `traceback` is imported with the
  # other libraries at the top; previously this handler raised NameError.
  print(traceback.format_exc())
finally:
  # Always persist the conversation, even after an error or an interrupt.
  store_history()

# Experiment Section