In [13]:
import requests
from newspaper import Article 
import time
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.agents.tools import Tool

from langchain.chat_models import ChatOpenAI
from langchain_experimental.plan_and_execute import PlanAndExecute, load_agent_executor, load_chat_planner


In [2]:
# We scrape several Artificial Intelligence news


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

article_urls = [
    "https://www.artificialintelligence-news.com/2023/05/23/meta-open-source-speech-ai-models-support-over-1100-languages/",
    "https://www.artificialintelligence-news.com/2023/05/18/beijing-launches-campaign-against-ai-generated-misinformation/"
    "https://www.artificialintelligence-news.com/2023/05/16/openai-ceo-ai-regulation-is-essential/",
    "https://www.artificialintelligence-news.com/2023/05/15/jay-migliaccio-ibm-watson-on-leveraging-ai-to-improve-productivity/",
    "https://www.artificialintelligence-news.com/2023/05/15/iurii-milovanov-softserve-how-ai-ml-is-helping-boost-innovation-and-personalisation/",
]

session = requests.Session()
pages_content = [] # where we save the scraped articles

for url in article_urls:
    try:
        time.sleep(2) # sleep two seconds for gentle scraping
        response = session.get(url, headers=headers, timeout=10)

        if response.status_code == 200:
            article = Article(url)
            article.download() # download HTML of webpage
            article.parse() # parse HTML to extract the article text
            pages_content.append({ "url": url, "text": article.text })
        else:
            print(f"Failed to fetch article at {url}")
    except Exception as e:
        print(f"Error occurred while fetching article at {url}: {e}")

In [4]:
# We'll use an embedding model to compute our documents' embeddings

# We'll store the documents and their embeddings in the deep lake vector db

# Setup deep lake
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

  warn_deprecated(


In [5]:
# create Deep Lake dataset
my_activeloop_org_id = "srishtysuman2919"
my_activeloop_dataset_name = "langchain_course_analysis_outline"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"
db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)


Using embedding function is deprecated and will be removed in the future. Please use embedding instead.


Your Deep Lake dataset has been successfully created!


 

In [7]:
# We split the article texts into small chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

all_texts = []
for d in pages_content:
    chunks = text_splitter.split_text(d["text"])
    for chunk in chunks:
        all_texts.append(chunk)

In [8]:
# we add all the chunks to the Deep lake
db.add_texts(all_texts)


Creating 40 embeddings in 1 batches of size 40:: 100%|██████████| 1/1 [00:35<00:00, 35.72s/it]

Dataset(path='hub://srishtysuman2919/langchain_course_analysis_outline', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (40, 1)      str     None   
 metadata     json      (40, 1)      str     None   
 embedding  embedding  (40, 1536)  float32   None   
    id        text      (40, 1)      str     None   





['2c29e01c-c66f-11ee-9bf2-acde48001122',
 '2c29e0d0-c66f-11ee-9bf2-acde48001122',
 '2c29e10c-c66f-11ee-9bf2-acde48001122',
 '2c29e134-c66f-11ee-9bf2-acde48001122',
 '2c29e15c-c66f-11ee-9bf2-acde48001122',
 '2c29e17a-c66f-11ee-9bf2-acde48001122',
 '2c29e1a2-c66f-11ee-9bf2-acde48001122',
 '2c29e1de-c66f-11ee-9bf2-acde48001122',
 '2c29e210-c66f-11ee-9bf2-acde48001122',
 '2c29e238-c66f-11ee-9bf2-acde48001122',
 '2c29e260-c66f-11ee-9bf2-acde48001122',
 '2c29e27e-c66f-11ee-9bf2-acde48001122',
 '2c29e2a6-c66f-11ee-9bf2-acde48001122',
 '2c29e2c4-c66f-11ee-9bf2-acde48001122',
 '2c29e2ec-c66f-11ee-9bf2-acde48001122',
 '2c29e30a-c66f-11ee-9bf2-acde48001122',
 '2c29e328-c66f-11ee-9bf2-acde48001122',
 '2c29e350-c66f-11ee-9bf2-acde48001122',
 '2c29e378-c66f-11ee-9bf2-acde48001122',
 '2c29e396-c66f-11ee-9bf2-acde48001122',
 '2c29e3b4-c66f-11ee-9bf2-acde48001122',
 '2c29e3d2-c66f-11ee-9bf2-acde48001122',
 '2c29e3fa-c66f-11ee-9bf2-acde48001122',
 '2c29e418-c66f-11ee-9bf2-acde48001122',
 '2c29e440-c66f-

In [10]:
# Get the retriever object from the deep lake db object and set the number
# of retrieved documents to 3
retriever = db.as_retriever()
retriever.search_kwargs['k'] = 3

In [11]:
# We define some variables that will be used inside our custom tool
CUSTOM_TOOL_DOCS_SEPARATOR ="\n---------------\n" # how to join together the retrieved docs to form a single string

# This is the function that defines our custom tool that retrieves relevant
# docs from Deep Lake
def retrieve_n_docs_tool(query: str) -> str:
    """Searches for relevant documents that may contain the answer to the query."""
    docs = retriever.get_relevant_documents(query)
    texts = [doc.page_content for doc in docs]
    texts_merged = "---------------\n" + CUSTOM_TOOL_DOCS_SEPARATOR.join(texts) + "\n---------------"
    return texts_merged

In [14]:
# We create the tool that uses the "retrieve_n_docs_tool" function
tools = [
    Tool(
        name="Search Private Docs",
        func=retrieve_n_docs_tool,
        description="useful for when you need to answer questions about current events about Artificial Intelligence"
    )
]


# let's create the Plan and Execute agent
model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
planner = load_chat_planner(model)
executor = load_agent_executor(model, tools, verbose=True)
agent = PlanAndExecute(planner=planner, executor=executor, verbose=True)

# we test the agent
response = agent.run("Write an overview of Artificial Intelligence regulations by governments by country")




  warn_deprecated(
  warn_deprecated(




[1m> Entering new PlanAndExecute chain...[0m
steps=[Step(value='Research and gather information on the current state of Artificial Intelligence (AI) regulations by governments in different countries.'), Step(value='Organize the information by country and categorize it based on the level of AI regulation.'), Step(value='Provide an overview of the AI regulations in each country, including any specific laws or policies that have been implemented.'), Step(value='Include information on the key areas covered by the regulations, such as data privacy, algorithmic transparency, liability, and ethical considerations.'), Step(value='Highlight any notable differences or similarities between the regulations in different countries.'), Step(value='Summarize the overall trends and developments in AI regulations globally.'), Step(value='Given the above steps taken, provide an overview of Artificial Intelligence regulations by governments by country. ')]

[1m> Entering new AgentExecutor chain...[0