## Semantic Search using ChatGPT + LangChain

### Import all the necessary libraries for performing Semantic Search

In [1]:
import os
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings.azure_openai import AzureOpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
import warnings
warnings.filterwarnings(action='ignore')

### Use a certificate for communicating with OpenAI

In [2]:
# Location of the CA certificate bundle, exposed through the REQUESTS_CA_BUNDLE
# environment variable. The path is relative, so it is resolved against the
# current working directory.
CA_BUNDLE_PATH = r"../../ca-bundle-full.crt"
os.environ["REQUESTS_CA_BUNDLE"] = CA_BUNDLE_PATH

In [3]:
# Fetch the source document for the pipeline. The Wikipedia article on
# Rahul Dravid is used as the external knowledge source.
WIKI_URL = "https://en.wikipedia.org/wiki/Rahul_Dravid"

# Restrict BeautifulSoup parsing to the <body> element of the page.
body_only = bs4.SoupStrainer("body")

loader = WebBaseLoader(
    web_paths=(WIKI_URL,),
    bs_kwargs={"parse_only": body_only},
)
docs = loader.load()

In [4]:
# Sanity check that the right page was downloaded: strip the newlines from the
# first document, then show characters 2000-3000 with tabs turned into spaces.
flattened_text = docs[0].page_content.replace("\n", "")
flattened_text[2000:3000].replace("\t", " ")

'lture, see Dravidian.Rahul DravidDravid in 2012Personal informationFull\xa0nameRahul Sharad DravidBorn (1973-01-11) 11 January 1973 (age\xa051)Indore, Madhya Pradesh, IndiaNicknameThe Wall, The Great Wall, Jammy, Mr. Dependable[1]Height1.80\xa0m (5\xa0ft 11\xa0in)BattingRight-handedBowlingRight arm off breakRoleBatsman, Part-time wicket-keeperWebsitewww.rahuldravid.comInternational informationNational sideIndia (1996–2012)Test debut (cap\xa0207)20 June 1996\xa0v\xa0EnglandLast Test24 January 2012\xa0v\xa0AustraliaODI debut (cap\xa095)3 April 1996\xa0v\xa0Sri LankaLast ODI16 September 2011\xa0v\xa0EnglandODI shirt no.19 (previously 5)Only T20I (cap\xa038)31 August 2011\xa0v\xa0EnglandT20I shirt no.19Domestic team informationYearsTeam1990–2012Karnataka2000Kent2003Scottish Saltires2008–2010Royal Challengers Bangalore2011–2013Rajasthan RoyalsHead coaching informationYearsTeam2015–2021India U-192015–2021India A2021–IndiaCareer statisticsCompetitionTestODIFCLAMatches164344298449Runs scored1

### Split the entire document into multiple chunks using RecursiveCharacterTextSplitter

In [5]:
# Splitter settings: maximum chunk size and the overlap kept between
# consecutive chunks (measured with the splitter's default length function).
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
splits = text_splitter.split_documents(docs)

### Create embeddings for the chunks using Azure OpenAI and store them in ChromaDB

In [6]:
# Name of the Azure OpenAI deployment that serves the embedding model.
EMBEDDING_DEPLOYMENT = "text-embedding-ada-002"

# Embed the chunks and index them in a Chroma vector store.
embeddings = AzureOpenAIEmbeddings(deployment=EMBEDDING_DEPLOYMENT)
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

In [7]:
# Building blocks of the retrieval-augmented generation pipeline.

# Retriever backed by the vector store built from the document chunks.
retriever = vectorstore.as_retriever()

# Prompt template "rlm/rag-prompt", pulled from the LangChain Hub.
prompt = hub.pull("rlm/rag-prompt")

# Chat model served through an Azure OpenAI deployment.
llm = AzureChatOpenAI(
    model_name="gpt-3.5-turbo",
    azure_deployment="gpt-35-turbo",
)

In [8]:
def format_docs(docs):
    """Merge the text of several documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined in order, separated by a blank line.
    """
    page_texts = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(page_texts)

### Create a RAG Chain and start asking questions

In [9]:
# Inputs fed to the prompt: "context" is the retrieved documents merged into
# one string by format_docs, "question" is the user's query passed through as is.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build the inputs -> fill the prompt -> call the chat model
# -> parse the model output into a plain string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [10]:
# First question: the chain was not able to answer this one
odi_runs_question = "How many runs Rahul Dravid scored in ODI?"
rag_chain.invoke(odi_runs_question)

"I don't know."

In [11]:
# Ask for Rahul Dravid's height; the response is correct
height_question = "What is the height of Rahul Dravid?"
rag_chain.invoke(height_question)

'The height of Rahul Dravid is 1.80 meters or 5 feet 11 inches.'

In [12]:
# Ask which schools Rahul Dravid attended
school_question = "Which schoold Rahul Dravid attended?"
rag_chain.invoke(school_question)

"Rahul Dravid attended St. Joseph's Boys High School and St Joseph's College of Commerce."

In [13]:
# Ask for the year of Rahul Dravid's debut
debut_question = "In which year Rahul Dravid made debut?"
rag_chain.invoke(debut_question)

'Rahul Dravid made his international debut in 1996.'

In [14]:
# The wording of this question was tuned by hand to get a relevant answer
keeper_question = "Against which country Rahul Dravid was made a designated keeper?"
rag_chain.invoke(keeper_question)

'Rahul Dravid was made a designated keeper against Pakistan.'

In [15]:
# Ask the chain itself about the cost of the prompts so far
cost_question = "What was the cost of the prompts until now?"
rag_chain.invoke(cost_question)

'The cost of the prompts until now is unknown.'

In [16]:
from langchain.callbacks import get_openai_callback

In [17]:
# Run one query inside the OpenAI callback context, then print the answer
# followed by the total cost the callback recorded, to four decimals.
with get_openai_callback() as cb:
    answer = rag_chain.invoke("Tell me about Rahul Dravid in brief")
    print(answer)
    print(f"Total Cost (USD): ${format(cb.total_cost, '.4f')}")

Rahul Dravid is a former Indian cricketer and current head coach of the Indian national team. He is known for his outstanding batting technique and is considered one of the greatest batsmen in the history of cricket. Dravid has scored 24,177 runs in international cricket and has been instrumental in the success of the Indian cricket team, including winning the 2002 ICC Champions Trophy.
Total Cost (USD): $0.0015
