<a href="https://colab.research.google.com/github/sarunsmenon/llm/blob/main/telstra_support_crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Libraries

In [41]:
!pip install -q python-dotenv openai langchain-openai cohere langchain langchain_community pypdf faiss-gpu wikipedia-api faiss-cpu wikipedia langchainhub unstructured playwright uuid7 langgraph gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.6/12.6 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.7/318.7 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.2/93.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h

# Load Libraries

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import textwrap

In [3]:
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List
from langchain_core.messages import BaseMessage, AIMessage

In [4]:
import os
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv, find_dotenv
from google.colab import userdata
import pickle

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

In [5]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader, WikipediaLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.tools.retriever import create_retriever_tool

from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.agents import create_tool_calling_agent
from langchain.agents import AgentExecutor

from langchain import hub
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory

from uuid_extensions import uuid7str
from langchain_core.pydantic_v1 import BaseModel, Field
import traceback


In [6]:
from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder
)

from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.agents import initialize_agent, AgentType

In [7]:
from langchain.chains import RetrievalQAWithSourcesChain

In [42]:
import gradio as gr

# Load Variables

In [9]:
# Credentials are read from the Colab secret store rather than hard-coded.
os.environ['OPENAI_API_KEY'] = userdata.get('open_ai_key')
# NOTE(review): prompt_template is not read by any cell visible in this notebook — confirm it is still needed.
prompt_template = "hwchase17/openai-functions-agent"

# Tracing switches; the API key comes from the secret named 'langsmith_api_key'.
os.environ['LANGCHAIN_TRACING_V2']="true"
os.environ['LANGCHAIN_API_KEY']=userdata.get('langsmith_api_key')

# NOTE(review): the chat cells key history by user_name, not by this id — confirm session_id is still used.
session_id = uuid7str()

In [10]:
# Model name kept in its own variable so it can be changed in one place.
llm_model = 'gpt-3.5-turbo-1106'
# Chat model handed to create_tool_calling_agent further down.
llm = ChatOpenAI(model=llm_model, temperature=0)

In [11]:
# Crawl and storage configuration (nothing is fetched in this cell).
start_url = 'https://www.telstra.com.au/support'
# NOTE(review): ignore_lst / include_lst are not read by crawl_website, which hard-codes its own URL filters.
ignore_lst = []
include_lst = ['support' ,'telstra']
# Default for crawl_website's max_pages argument.
max_pg_lmt = 5000
# Folder name (relative to fldr) of the saved FAISS index.
db_name = "faiss_telstra_support_db"
# Working folder for every pickle and the FAISS index.
# NOTE(review): path is under /content/drive — presumably Google Drive must be mounted first; no mount call is visible here.
fldr = '/content/drive/MyDrive/Colab Notebooks/Langchain/telstra_support/'

In [12]:
# Session id -> chat history object. get_by_session_id adds entries on demand,
# store_history pickles this dict, and the "Load History" cell may replace it.
hist_store= {}

In [13]:
# Make sure the working folder exists before anything reads from or writes to it.
if os.path.exists(fldr):
    print(f'Folder already exists at: {fldr}')
else:
    os.makedirs(fldr)
    print(f'Folder created at: {fldr}')

Folder already exists at: /content/drive/MyDrive/Colab Notebooks/Langchain/telstra_support/


# Load Functions

In [14]:
def load_history():
  """Load the pickled chat-history store from ``fldr + "history.pkl"``.

  Returns:
    The unpickled object, or an empty dict when the file is missing, is
    empty (e.g. left behind by an interrupted write) or contains ``None``.

  The loaded store is also printed, as before.
  """
  history_path = fldr+"history.pkl"
  try:
    # NOTE(security): unpickling runs code embedded in the file; only load
    # history files this notebook wrote itself.
    with open(history_path, "rb") as f:
      hist_store = pickle.load(f)
  except (FileNotFoundError, EOFError):
    # No usable history yet: start from an empty store instead of failing.
    hist_store = None
  if hist_store is None:
    hist_store = {}
  print(hist_store)
  return hist_store

In [15]:
def store_history():
  """Pickle the module-level ``hist_store`` to ``fldr + "history.pkl"``.

  The store is serialised in memory first. Opening the file with "wb"
  truncates it, so pickling straight into the file would wipe the previously
  saved history whenever serialisation fails part-way.

  Raises:
    Whatever ``pickle.dumps`` raises for an unpicklable store; in that case
    the existing file is left untouched.
  """
  payload = pickle.dumps(hist_store)
  with open(fldr+"history.pkl", "wb") as f:
    f.write(payload)

In [16]:
# Function to get all links from a page
def get_all_links(url, timeout=30):
    """Fetch ``url`` and return every hyperlink on the page as an absolute URL.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a single stalled connection blocks the crawl
            indefinitely.

    Returns:
        One entry per ``<a href=...>`` tag, in document order, with relative
        links resolved against ``url``. Duplicates are kept.

    Raises:
        Any exception raised by ``requests.get`` (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    return [urljoin(url, a.get('href')) for a in soup.find_all('a', href=True)]

In [17]:
# Function to crawl the website
def crawl_website(start_url, max_pages=max_pg_lmt):
    """Breadth-first crawl of Telstra support pages starting at ``start_url``.

    A URL is accepted only when it contains "telstra.com.au" and "support"
    and does not contain "mobilesupport.telstra.com.au". Accepted pages are
    downloaded with ``get_all_links`` and their links are queued.

    Args:
        start_url: First URL to visit.
        max_pages: Stop once this many URLs have been accepted.

    Returns:
        Set of accepted URLs, with any "#fragment" removed.
    """
    # Imported locally so the cell does not depend on edits to the import cells.
    from collections import deque
    from urllib.parse import urldefrag

    itr = 0
    visited = set()
    # deque gives O(1) removal from the front; list.pop(0) shifts every element.
    to_visit = deque([start_url])

    while to_visit and len(visited) < max_pages:
        # A fragment only addresses a position inside the same document, so
        # "page#a" and "page#b" would otherwise be fetched and stored twice.
        url = urldefrag(to_visit.popleft()).url

        if (
            (url not in visited) and
            ("telstra.com.au" in url) and
            ("support" in url) and
            ("mobilesupport.telstra.com.au" not in url)
        ):
            visited.add(url)
            try:
                to_visit.extend(get_all_links(url))
            except Exception:
                # Best effort: a page that cannot be fetched or parsed stays in
                # `visited` but contributes no links. Catching Exception (not a
                # bare except) lets Ctrl-C / kernel interrupt stop the crawl.
                continue

            # Progress line for every 10th page that was fetched successfully.
            itr += 1
            if itr % 10 == 0:
                print(f"Visited {len(visited)}: {url}")

    return visited

In [18]:
def extract_process_url(url):
  """Download one page and return it split into chunk documents.

  The page is loaded with ``UnstructuredURLLoader`` and split by
  ``CharacterTextSplitter`` on blank lines ("\\n\\n") with chunk_size=1000
  and chunk_overlap=5, measured with ``len``.
  """
  page_docs = UnstructuredURLLoader(urls=[url]).load()

  splitter_options = dict(
      chunk_size=1000,
      chunk_overlap=5,
      separator="\n\n",
      length_function=len,
      is_separator_regex=False,
  )
  return CharacterTextSplitter(**splitter_options).split_documents(page_docs)


In [19]:
def store_doc_into_db(docs, faiss_rmit_db):
  """Add ``docs`` to a FAISS store, creating the store on first use.

  Args:
    docs: Chunk documents to embed and index.
    faiss_rmit_db: Existing FAISS store, or ``None`` when none exists yet.

  Returns:
    The store containing ``docs``. When ``docs`` is empty nothing is embedded
    and ``faiss_rmit_db`` is returned unchanged (possibly still ``None``).
  """
  if not docs:
    # A page that produced no chunks has nothing to index; skip the FAISS and
    # embedding calls rather than passing them an empty list.
    return faiss_rmit_db

  if faiss_rmit_db is None:
    faiss_rmit_db = FAISS.from_documents(docs, OpenAIEmbeddings())
  else:
    faiss_rmit_db.add_documents(docs)

  return faiss_rmit_db


In [55]:
def generate_chat_response(message, local_session_id):
  """Send ``message`` to the history-aware agent and return its answer text.

  ``local_session_id`` is passed as the configurable ``session_id``. The full
  result returned by the agent is printed before its 'output' entry is returned.
  """
  run_config = {"configurable": {"session_id": local_session_id}}
  agent_result = agent_with_chat_history.invoke({"input": message}, config=run_config)
  print(agent_result)
  return agent_result['output']

In [21]:
def get_by_session_id(session_id: str) -> BaseChatMessageHistory:
    """Return the history stored under ``session_id`` in ``hist_store``.

    A new, empty ``InMemoryHistory`` is created and registered the first time
    an id is requested.
    """
    try:
        return hist_store[session_id]
    except KeyError:
        hist_store[session_id] = InMemoryHistory()
        return hist_store[session_id]

In [22]:
def get_user_and_retrieve_history():
  """Return the chat history of the current user.

  The user is the module-level ``user_name``, which has to be assigned before
  this function is called; the lookup is delegated to ``get_by_session_id``.
  """
  return get_by_session_id(user_name)

def get_user():
  """Prompt on standard input for a username and return exactly what was typed."""
  return input("Enter your username : ")

In [23]:
class InMemoryHistory(BaseChatMessageHistory, BaseModel):
    """In memory implementation of chat message history.

    Instances are the values kept in ``hist_store`` (one per session id) and
    are therefore what ``store_history`` pickles to disk.
    """

    # Messages recorded so far; default_factory gives each instance its own list.
    messages: List[BaseMessage] = Field(default_factory=list)

    def add_messages(self, messages: List[BaseMessage]) -> None:
        """Add a list of messages to the store"""
        # Appended in place, preserving the order in which they were supplied.
        self.messages.extend(messages)

    def clear(self) -> None:
        """Drop all stored messages by rebinding ``messages`` to a new empty list."""
        self.messages = []

# Load History

In [24]:
# Restore chat histories saved by an earlier run, when a history file exists.
if os.path.exists(fldr+"history.pkl"):
  hist_store = load_history()
  # The previous message ('Folder already exists at: ...') was copied from the
  # folder-creation cell and did not describe what happened here.
  print(f'History loaded from: {fldr}history.pkl')

{'sam': InMemoryHistory(messages=[])}
Folder already exists at: /content/drive/MyDrive/Colab Notebooks/Langchain/telstra_support/


In [58]:
# Display the current history store (session id -> history object).
hist_store

{'sam': InMemoryHistory(messages=[HumanMessage(content='How to pay bill?'), AIMessage(content="To pay your Telstra bill, you can do the following:\n\n1. Sign in to My Telstra to pay your bills online. This allows for easy bill payment while you are on the go.\n\n2. Set up direct debit to avoid late fees for missed payments. Automatic payments take the stress out of remembering to pay your bill, so you can set it and forget it.\n\n3. Pay using BPAY:\n   - If you already have BPAY, sign in to your online banking, choose the BPAY payment option, and enter the Telstra BPAY biller code (7773), BPAY reference number (your account number at the bottom of your bill), and the amount you want to pay.\n   - If you'd like to set up BPAY, sign in to your financial institution's website, search for BPAY, and follow the prompts. Alternatively, you can contact your financial institution directly.\n\n4. Pay over the phone.\n\nFor more details, you can visit the Telstra website or contact Telstra custom

# Create URL Lists

In [None]:
# Run the crawl from start_url; max_pages falls back to its default (max_pg_lmt).
# This issues live HTTP requests, one per accepted page.
visited_urls = crawl_website(start_url)
print(f"Total visited URLs: {len(visited_urls)}")

Visited 16: https://www.telstra.com.au/small-business/online-support/business-software
Visited 26: https://www.telstra.com.au/support/account-payment/remove-service-restriction
Visited 37: https://www.telstra.com.au/support/account-payment/make-a-complaint
Visited 51: https://www.telstra.com.au/support/internet-and-home-phone/home-internet-order-next-steps


In [None]:
# Persist the crawl result (a set of URLs) next to the other artefacts in fldr.
with open(fldr+"url_list.pkl", "wb") as f:
    pickle.dump(visited_urls, f)

In [None]:
# Reload the saved URLs as a list; the indexing loop slices it, which a set does not support.
with open(fldr+"url_list.pkl", "rb") as f:
    visited_urls = list(pickle.load(f))

# Collects the URLs whose extraction or indexing raised in the loop that builds the database.
error_url_lst = []

In [None]:
# Sanity check: number of URLs available for indexing.
print(f"Total visited URLs: {len(visited_urls)}")

# Create Database

In [None]:
# Reuse the FAISS index saved under fldr+db_name when present; otherwise start
# with None so the first indexed page creates the store.
if not os.path.exists(fldr+db_name):
  faiss_telstra_support_db = None
  print("create a new database because none exists")
else:
  faiss_telstra_support_db = FAISS.load_local(
      fldr+db_name,
      embeddings=OpenAIEmbeddings(),
      allow_dangerous_deserialization=True,
  )
  print("DB already exists")


In [None]:
# Embed and index every crawled page, at most max_pg_lmt of them (the same cap
# the crawl uses, instead of a second hard-coded 5000).
for url in visited_urls[:max_pg_lmt]:
  print(url)
  try:
    docs = extract_process_url(url)
    faiss_telstra_support_db = store_doc_into_db(docs, faiss_telstra_support_db)
  except Exception as err:
    # Keep going on a bad page, but say why it failed and remember the URL.
    # Catching Exception rather than a bare except keeps the loop interruptible.
    print(f"  failed: {type(err).__name__}: {err}")
    error_url_lst.append(url)


# Write Everything

In [None]:
# Quick check that a store object exists (prints None if nothing was indexed).
print(faiss_telstra_support_db)

<langchain_community.vectorstores.faiss.FAISS object at 0x7ca7940fdf00>


In [None]:
# Write the FAISS index to fldr+db_name.
faiss_telstra_support_db.save_local(fldr+db_name)

In [None]:
# Keep the list of URLs that failed during indexing.
with open(fldr+"error_urls.pkl", "wb") as f:
  pickle.dump(error_url_lst, f)

# Create Retriever

In [27]:
# Load the saved FAISS index for querying; None when no index has been saved.
faiss_telstra_support_db = (
    FAISS.load_local(fldr+db_name, embeddings=OpenAIEmbeddings(), allow_dangerous_deserialization=True)
    if os.path.exists(fldr+db_name)
    else None
)

In [28]:
# Retriever over the support index: similarity_score_threshold search with
# score_threshold 0.5 and k = 1.
primary_retriever = faiss_telstra_support_db.as_retriever(
    search_kwargs={"k": 1, "score_threshold": 0.5},
    search_type="similarity_score_threshold",
)


In [29]:
# Sample question used to smoke-test the retriever; the second one is an alternative.
qn = " will my pixel 4xl work after 3g exit"
#qn = "What's the 3g exit plan"

In [30]:
# Show what the retriever returns for the sample question.
primary_retriever.invoke(qn)

[Document(metadata={'source': 'https://www.telstra.com.au/support/mobiles-devices/3g-closure#lightbox-mandarin'}, page_content='To access the benefits of our 4G network, you need to be using compatible devices. This includes using appropriate devices for your requirements – for example, if currently using a Blue Tick 3G device you’ll need a Blue Tick 4G device, or if currently relying on a 3G T-Go or TMSA coverage extension device, you’ll need an equivalent 4G device to ensure equivalent coverage.\n\nOur 3G network will remain open until 31 August 2024.\n\nIf you hear a recorded message on your phone about our 3G network closure, you have a device that needs to be upgraded to stay connected after the network closes.\n\nYou’ll hear the message if your device relies on the 3G network, doesn’t support Voice over 4G (VoLTE), or is 4G-enabled but uses 3G for emergency calls.\n\nWe can’t remove the message, as it’s vitally important to ensure you’re using a device capable of contacting Tripl

# Agents

In [31]:
# System message text. It has no template variables, so a plain string literal
# produces exactly the same characters as the former f-string.
system_template_text = """

        You are a helpful assistant for parents enquiring about Telstra Products. This tool may also be used by kids. So the result should be polite and helpful.
        If you cant find enough info start with 'I dont know'.

        For any questions that are not related to support from Telstra , just say - "Ask me only about Telstra". for generic questions refer them to use ChatGPT.

        Beneath the response also add the Document metadata source URL from where this was retrieved.
        """

system_message_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(template=system_template_text, input_variables=[])
)

# The human turn is the raw user input, passed through unchanged.
human_message_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(template="{input}", input_variables=["input"])
)

# Conversation buffer created with return_messages=True.
# NOTE(review): no other visible cell reads `memory` — confirm it is still needed.
memory = ConversationBufferMemory(return_messages=True)

# Message order: system text, optional earlier turns, the new user input, and
# finally the agent scratchpad placeholder.
chatbot_template = [
    system_message_prompt,
    MessagesPlaceholder(variable_name="chat_history", optional=True),
    human_message_prompt,
    MessagesPlaceholder(variable_name="agent_scratchpad"),
]

prompt = ChatPromptTemplate.from_messages(chatbot_template)
prompt

ChatPromptTemplate(input_variables=['agent_scratchpad', 'input'], optional_variables=['chat_history'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]], 'agent_scratchpad': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, partial_variables={'chat_history': []}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='\n\n        You are a helpful assistant for parents enquiring about Telstra Products. This tool may also be used by kids. So the result sho

In [34]:
# Expose the FAISS retriever to the agent as its only tool.
telsta_support_tool = create_retriever_tool(
    retriever=primary_retriever,
    name="telstra_support_search",
    description="Use this to answer any question about Telstra",
)

tools = [telsta_support_tool]

In [51]:
# Build the tool-calling agent and its executor.
# Deliberately not wrapped in try/except: swallowing a failure here would leave
# `agent` / `agent_executor` undefined (or stale from an earlier run of this
# cell), so the following cells would fail confusingly or silently use an old
# executor. An error now stops the notebook at its real cause.
agent = create_tool_calling_agent(llm=llm, tools=tools, prompt=ChatPromptTemplate.from_messages(chatbot_template))
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)

# Chat History

In [52]:
# Wrap the executor so each call reads and appends to a per-session history.
# The history factory is get_by_session_id, so the history used is the one for
# the `session_id` supplied in the call's config. The previous factory ignored
# that id and read the global `user_name`, which is assigned only in a later cell.
agent_with_chat_history = RunnableWithMessageHistory(
    agent_executor,
    get_session_history=get_by_session_id,
    input_messages_key="input",
    history_messages_key="chat_history",
)

# Execution

In [38]:
# Ask who is chatting; user_name is used as the session id for chat history.
user_name = get_user()
print(f"welcome :: {user_name}")

Enter your username : sam
welcome :: sam


In [None]:
# Console chat loop: starts with a canned question, then keeps asking until
# the user types "exit". History is saved however the loop ends.
try:
  input_qn  = "Hi Bot, How to Pay Bill"
  while input_qn != "exit":
    print("start chat")
    response = generate_chat_response(input_qn, user_name)
    print(response)
    print(textwrap.fill(response, 80))
    print("")
    input_qn = input("Enter a message (to finish use exit): ")
except Exception:
  # Report agent/API errors without losing the session. A bare `except:` also
  # swallowed KeyboardInterrupt; an interrupt now propagates after `finally`.
  print(traceback.format_exc())
finally:
  store_history()

In [53]:
def predict(message, history):
    """Gradio chat callback: answer ``message`` for the current user.

    Args:
        message: Text the user just submitted.
        history: Conversation so far as supplied by Gradio. Unused, because
            the agent keeps its own per-session history.

    Returns:
        The agent's answer text.

    The history store is pickled after every answer.

    The earlier version also built a ``history_langchain_format`` list that was
    never used. It did so by iterating the stored history object, which is a
    pydantic model and does not yield (human, ai) turns, so that dead code was
    removed.
    """
    gpt_response = generate_chat_response(message, user_name)
    store_history()
    return gpt_response

In [56]:
# Gradio chat front-end; every submitted message is answered by predict().
# With cache_examples=True the example prompts are run through predict() while
# the interface is being built (see the "Caching examples" output of this cell).
demo = gr.ChatInterface(
    predict,
    title="Your PA to Telstra Support in Internet - Unoffical and unrelated to Telstra corporation",
    description="Your PA to Telstra Support in Internet",
    theme="soft",
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(container=False, scale=7, placeholder="Ask me any qns on Telstra Products and Services"),
    examples=["How to pay bill?", "3G exit"],
    cache_examples=True,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)

Caching examples at: '/content/gradio_cached_examples/79'
Caching example 1/2
{'input': 'How to pay bill?', 'chat_history': [HumanMessage(content='How to pay bill?'), AIMessage(content="To pay your Telstra bill, you can do the following:\n\n1. Sign in to My Telstra to pay your bills online. This allows for easy bill payment while you are on the go.\n\n2. Set up direct debit to avoid late fees for missed payments. Automatic payments take the stress out of remembering to pay your bill, so you can set it and forget it.\n\n3. Pay using BPAY:\n   - If you already have BPAY, sign in to your online banking, choose the BPAY payment option, and enter the Telstra BPAY biller code (7773), BPAY reference number (your account number at the bottom of your bill), and the amount you want to pay.\n   - If you'd like to set up BPAY, sign in to your financial institution's website, search for BPAY, and follow the prompts. Alternatively, you can contact your financial institution directly.\n\n4. Pay over 

In [57]:
# Start the Gradio app; in Colab this run reported a public share URL.
demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://41d16748d7e8cc1c34.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




# Experiment Section