In [1]:
import getpass
import os
import pickle
import time

import langchain
import streamlit as st
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

In [11]:
# Credentials must not be written into the notebook: a saved or shared copy
# would leak the key. Use the value already present in the environment, and
# only when it is missing prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [3]:
# Completion model shared by the rest of the notebook; it is handed to the
# QA chain further down.
llm = OpenAI(
    max_tokens=500,
    temperature=0.9,
)

In [4]:
# Articles that make up the knowledge base for this notebook.
article_urls = [
   "https://www.moneycontrol.com/news/politics/ipef-nations-conclude-talk-on-clean-economy-agreement-proposes-members-to-work-on-sustainable-measures-11757861.html",
   "https://www.moneycontrol.com/news/cricket/icc-world-cup-2023-australia-edge-south-africa-and-will-play-india-in-final-11757771.html",
]
news = UnstructuredURLLoader(urls=article_urls)

# load() returns the loaded documents; the count is displayed as a quick
# sanity check (presumably one document per URL -- confirm against the output).
df = news.load()
len(df)

2

In [5]:
# `df` already holds document objects, so split_documents() is used instead
# of split_text(). The splitter is configured with chunk_size=1000 and
# chunk_overlap=200.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
docs = text_splitter.split_documents(df)

In [6]:
# Display how many chunks are in `docs`.
len(docs)

46

In [7]:
# Inspect the first chunk to see what the split content looks like.
docs[0]

Document(page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nTrending Stocks\n\nSuzlon Energy\xa0INE040H01021, SUZLON, 532667\n\nYes Bank\xa0INE528G01035, YESBANK, 532648\n\nTata Power\xa0INE245A01021, TATAPOWER, 500400\n\nSBI\xa0INE062A01020, SBIN, 500112\n\nTata Motors\xa0INE155A01022, TATAMOTORS, 500570\n\nCheck your Credit Score here!\n\nQuotes\n\nMutual Funds\n\nCommodities\n\nFutures & Options\n\nCurrency\n\nNews\n\nCryptocurrency\n\nForum\n\nNotices\n\nVideos\n\nGlossary\n\nAll\n\nHello, LoginHello, LoginLog-inor Sign-UpMy AccountMy Profile My PortfolioMy WatchlistMy Credit Score₹100 CashbackMy FeedMy MessagesMy AlertsMy Profile My PROMy PortfolioMy WatchlistMy Credit Score₹100 CashbackMy FeedMy MessagesMy AlertsLogoutChat with UsDownload AppFollow us on:\n\nUpgrade\n\nMy Feed', metadata={'source': 'https://www.moneycontrol.com/news/politics/ipef-nations-conclude-talk-on-clean-economy-agreement-proposes-members-to-work-on-sustainable-measures-11757861.html'})

In [12]:
## Embed every chunk with OpenAI embeddings and build a FAISS index from them
embeddings = OpenAIEmbeddings()
vectorindex_openai = FAISS.from_documents(documents=docs, embedding=embeddings)

In [13]:
# Persist the freshly built vector index to a local pickle file
file_path = "vector_idx.pkl"
with open(file_path, "wb") as index_file:
    index_file.write(pickle.dumps(vectorindex_openai))

In [14]:
# Reload the persisted index from disk. Fail loudly when the file is missing:
# silently skipping the load would leave `vectorIndex` undefined (or stale
# from an earlier run), and the problem would only surface in a later cell.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Vector index file not found: {file_path}")

# NOTE(security): pickle.load can execute arbitrary code. Only load a file
# that this notebook wrote itself, never one from an untrusted source.
with open(file_path, "rb") as f:
    vectorIndex = pickle.load(f)

In [15]:
## Build the QA-with-sources chain: the retriever comes from the reloaded
## vector index and the LLM produces the final answer
retriever = vectorIndex.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(retriever=retriever, llm=llm)
chain



In [16]:
# Switch on langchain's global debug flag before running the chain
# (presumably to trace the intermediate calls -- confirm this is still wanted).
langchain.debug = True

# Ask the question; return_only_outputs=True is passed so that only the
# chain's outputs are returned.
query = "When was IPEF was launched jointly by US?"
chain({"question": query}, return_only_outputs=True)

{'answer': ' IPEF was launched jointly by the US and other partner countries of the Indo-Pacific region on May 23, 2020.\n',
 'sources': 'https://www.moneycontrol.com/news/politics/ipef-nations-conclude-talk-on-clean-economy-agreement-proposes-members-to-work-on-sustainable-measures-11757861.html'}