RetrievalQAWithSourcesChain: retrieves answers and tracks their sources.

In [9]:
import requests
from newspaper import Article
import time
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import OpenAI

In [2]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

article_urls = [
    "https://www.artificialintelligence-news.com/2023/05/16/openai-ceo-ai-regulation-is-essential/",
    "https://www.artificialintelligence-news.com/2023/05/15/jay-migliaccio-ibm-watson-on-leveraging-ai-to-improve-productivity/",
    "https://www.artificialintelligence-news.com/2023/05/15/iurii-milovanov-softserve-how-ai-ml-is-helping-boost-innovation-and-personalisation/",
    "https://www.artificialintelligence-news.com/2023/05/11/ai-and-big-data-expo-north-america-begins-in-less-than-one-week/",
    "https://www.artificialintelligence-news.com/2023/05/02/ai-godfather-warns-dangers-and-quits-google/",
    "https://www.artificialintelligence-news.com/2023/04/28/palantir-demos-how-ai-can-used-military/"
]

session = requests.Session()
pages_content = [] # where we save the scraped articles

for url in article_urls:
    try:
        time.sleep(2) # sleep two seconds for gentle scraping
        response = session.get(url, headers=headers, timeout=10)

        if response.status_code == 200:
            article = Article(url)
            article.download() # download HTML of webpage
            article.parse() # parse HTML to extract the article text
            pages_content.append({ "url": url, "text": article.text })
        else:
            print(f"Failed to fetch article at {url}")
    except Exception as e:
        print(f"Error occurred while fetching article at {url}: {e}")

In [6]:
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

  warn_deprecated(


In [7]:
my_activeloop_org_id = "srishtysuman2919"
my_activeloop_dataset_name = "langchain_course_qabot_with_source"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"
db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)

Using embedding function is deprecated and will be removed in the future. Please use embedding instead.


Your Deep Lake dataset has been successfully created!


 

In [8]:
# We split the article texts into small chunks. While doing so, we keep track of each
# chunk metadata (i.e. the URL where it comes from). Each metadata is a dictionary and
# we need to use the "source" key for the document source so that we can then use the
# RetrievalQAWithSourcesChain class which will automatically retrieve the "source" item
# from the metadata dictionary.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

all_texts, all_metadatas = [], []
for d in pages_content:
    chunks = text_splitter.split_text(d["text"])
    for chunk in chunks:
        all_texts.append(chunk)
        all_metadatas.append({ "source": d["url"] })

# we add all the chunks to the deep lake, along with their metadata
db.add_texts(all_texts, all_metadatas)

Creating 49 embeddings in 1 batches of size 49:: 100%|██████████| 1/1 [00:40<00:00, 40.24s/it]

Dataset(path='hub://srishtysuman2919/langchain_course_qabot_with_source', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (49, 1)      str     None   
 metadata     json      (49, 1)      str     None   
 embedding  embedding  (49, 1536)  float32   None   
    id        text      (49, 1)      str     None   





['ab5e6052-c678-11ee-a35e-acde48001122',
 'ab5e6200-c678-11ee-a35e-acde48001122',
 'ab5e62d2-c678-11ee-a35e-acde48001122',
 'ab5e6322-c678-11ee-a35e-acde48001122',
 'ab5e6372-c678-11ee-a35e-acde48001122',
 'ab5e63ae-c678-11ee-a35e-acde48001122',
 'ab5e63f4-c678-11ee-a35e-acde48001122',
 'ab5e6458-c678-11ee-a35e-acde48001122',
 'ab5e64a8-c678-11ee-a35e-acde48001122',
 'ab5e64e4-c678-11ee-a35e-acde48001122',
 'ab5e6520-c678-11ee-a35e-acde48001122',
 'ab5e655c-c678-11ee-a35e-acde48001122',
 'ab5e65a2-c678-11ee-a35e-acde48001122',
 'ab5e65de-c678-11ee-a35e-acde48001122',
 'ab5e661a-c678-11ee-a35e-acde48001122',
 'ab5e664c-c678-11ee-a35e-acde48001122',
 'ab5e6728-c678-11ee-a35e-acde48001122',
 'ab5e67be-c678-11ee-a35e-acde48001122',
 'ab5e67fa-c678-11ee-a35e-acde48001122',
 'ab5e6822-c678-11ee-a35e-acde48001122',
 'ab5e6854-c678-11ee-a35e-acde48001122',
 'ab5e6890-c678-11ee-a35e-acde48001122',
 'ab5e68cc-c678-11ee-a35e-acde48001122',
 'ab5e68f4-c678-11ee-a35e-acde48001122',
 'ab5e694e-c678-

In [10]:
# we create a RetrievalQAWithSourcesChain chain, which is very similar to a
# standard retrieval QA chain but it also keeps track of the sources of the
# retrieved documents
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)


# from_chain_type method: arguments-> 1. LLM: instance of a model (GPT-3 for eg;)
#                                     2. chain_type: type of chain being used, which influences how the model processes the retrieved documents and generates responses. 
#                                     3. retriever: sets up the retriever that will fetch the relevant documents from the Deep Lake database. 
chain = RetrievalQAWithSourcesChain.from_chain_type(llm=llm,
                                                    chain_type="stuff",
                                                    retriever=db.as_retriever())

# We generate a response to a query using the chain. The response object is a dictionary containing
# an "answer" field with the textual answer to the query, and a "sources" field containing a string made
# of the concatenation of the metadata["source"] strings of the retrieved documents.
d_response = chain({"question": "What does Geoffrey Hinton think about recent trends in AI?"})

print("Response:")
print(d_response["answer"])
print("Sources:")
for source in d_response["sources"].split(", "):
    print("- " + source)


  warn_deprecated(
  warn_deprecated(


Response:
 Geoffrey Hinton, known as the "Godfather of AI," has expressed concerns about the potential dangers of AI and left his position at Google to discuss them openly. He has warned about the rapid development of generative AI products and the potential for false information to be spread. He also has concerns about the impact of AI on the job market. Other experts, such as Elon Musk, Neil deGrasse Tyson, and Stephen Hawking, have also expressed concerns about the risks of AI. 

Sources:
- https://www.artificialintelligence-news.com/2023/05/02/ai-godfather-warns-dangers-and-quits-google/
