In [8]:
import os
import requests
import csv
from apify_client import ApifyClient
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import ApifyDatasetLoader
from langchain_community.document_loaders.base import Document
from langchain_core.messages import HumanMessage
from langchain.indexes import VectorstoreIndexCreator
from langchain.tools.retriever import create_retriever_tool
import chromadb
import chromadb.config

In [2]:
# Load variables from the local .env file into the process environment so the
# os.getenv('APIFY_API_TOKEN') lookups in the functions below can find the token.
load_dotenv('.env')

def google_search_actor(question):
    """Run Apify's Google search actor for a query and return the organic result URLs.

    Args:
        question: The search query, sent as the actor's ``queries`` input.

    Returns:
        list: URLs found in the ``organicResults`` of every result page the
        actor stored in its default dataset. Empty when nothing was found.

    Raises:
        ValueError: If ``question`` is empty or only whitespace.
        RuntimeError: If ``APIFY_API_TOKEN`` is not set, or the actor call
            returned no run details.
    """
    # Fail before spending an actor run on a query that cannot return anything.
    if not str(question or "").strip():
        raise ValueError("question must be a non-empty search query")

    api_token = os.getenv('APIFY_API_TOKEN')
    if not api_token:
        raise RuntimeError("APIFY_API_TOKEN is not set; add it to .env or the environment")

    # Initialize apify client
    client = ApifyClient(api_token)

    # prepare google search actor input
    run_input = {
        "queries": question,
        "maxPagesPerQuery": 1,
        "resultsPerPage": 9,
        "mobileResults": False,
        "languageCode": "",
        "maxConcurrency": 10,
        "saveHtml": False,
        "saveHtmlToKeyValueStore": False,
        "includeUnfilteredResults": False,
        "customDataFunction": """async ({ input, $, request, response, html }) => {
        return {
        pageTitle: $('title').text(),
        };
    };""",
    }

    # running the actor (blocks until the run finishes)
    serp = client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)
    if not serp:
        raise RuntimeError("Google search actor call returned no run details")

    result_pages = client.dataset(serp["defaultDatasetId"]).list_items().items

    # Creating a url list of websites for content crawling. Walk every stored
    # result page (not only the first) and skip pages or results that lack the
    # expected keys instead of raising IndexError / KeyError.
    url_list = []
    for page in result_pages:
        for result in page.get("organicResults") or []:
            url = result.get("url") if result else None
            if url:
                url_list.append(url)

    print("Query:", question ,"has",len(url_list), "urls")

    return url_list

In [3]:
def content_crawler(urls, max_crawl_depth=2, max_crawl_pages=4):
    """Scrape the given URLs with Apify's content crawler and return a dataset loader.

    Args:
        urls: Iterable of URL strings used as the crawler's start URLs.
        max_crawl_depth: Value sent as the actor's ``maxCrawlDepth`` input.
            Defaults to 2, the previously hard-coded value.
        max_crawl_pages: Value sent as the actor's ``maxCrawlPages`` input.
            Defaults to 4, the previously hard-coded value.
            NOTE(review): if this limit counts start URLs too, passing more
            than 4 URLs means some are never crawled - confirm against the
            actor's input documentation and raise it if so.

    Returns:
        ApifyDatasetLoader: A loader over the run's default dataset that maps
        each item to a ``Document`` whose content is the item's ``text`` and
        whose ``source`` metadata is the item's ``url``.

    Raises:
        ValueError: If ``urls`` is empty or None.
        RuntimeError: If ``APIFY_API_TOKEN`` is not set, or the actor call
            returned no run details.
    """
    start_urls = []
    for url in urls or []:
        start_urls.append({"url": url})
        print("Scraping", url)

    # An actor run with nothing to crawl is wasted; fail early instead.
    if not start_urls:
        raise ValueError("urls must contain at least one URL to crawl")

    api_token = os.getenv('APIFY_API_TOKEN')
    if not api_token:
        raise RuntimeError("APIFY_API_TOKEN is not set; add it to .env or the environment")

    # Initialize apify client
    client = ApifyClient(api_token)

    content_crawler_input = {
        "startUrls": start_urls,
        "useSitemaps": False,
        "crawlerType": "playwright:firefox",
        "includeUrlGlobs": [],
        "excludeUrlGlobs": [{ "url": "https://www.britannica.com"}],
        "ignoreCanonicalUrl": False,
        "maxCrawlDepth": max_crawl_depth,
        "maxCrawlPages": max_crawl_pages,
        "initialConcurrency": 0,
        "maxConcurrency": 200,
        "initialCookies": [],
        "proxyConfiguration": { "useApifyProxy": True },
        "maxSessionRotations": 10,
        "maxRequestRetries": 3,
        "requestTimeoutSecs": 60,
        "dynamicContentWaitSecs": 10,
        "maxScrollHeightPixels": 5000,
        "removeElementsCssSelector": """nav, footer, script, style, noscript, svg,
        [role=\"alert\"],
        [role=\"banner\"],
        [role=\"dialog\"],
        [role=\"alertdialog\"],
        [role=\"region\"][aria-label*=\"skip\" i],
        [aria-modal=\"true\"]""",
        "removeCookieWarnings": True,
        "clickElementsCssSelector": "[aria-expanded=\"false\"]",
        "htmlTransformer": "readableText",
        "readableTextCharThreshold": 100,
        "aggressivePrune": False,
        "debugMode": False,
        "debugLog": False,
        "saveHtml": False,
        "saveMarkdown": True,
        "saveFiles": False,
        "saveScreenshots": False,
        "maxResults": 9999999,
        "clientSideMinChangePercentage": 15,
        "renderingTypeDetectionPercentage": 10,
    }

    # Run the content crawler actor and wait for it to finish
    website_content_crawler = client.actor("aYG0l9s7dbB7j3gbS").call(run_input=content_crawler_input)
    if not website_content_crawler:
        raise RuntimeError("Content crawler actor call returned no run details")

    def _to_document(dataset_item):
        # Items missing "text" or "url" become empty strings rather than
        # raising KeyError in the middle of loading the dataset.
        return Document(
            page_content=dataset_item.get("text") or "",
            metadata={"source": dataset_item.get("url") or ""},
        )

    # Loads the dataset into langchain document format
    loader = ApifyDatasetLoader(
        dataset_id=website_content_crawler["defaultDatasetId"],
        dataset_mapping_function=_to_document,
    )
    return loader

In [4]:
def flatten_comprehension(matrix):
    """Flatten one level of nesting: return the items of every row in a single list."""
    flattened = []
    for row in matrix:
        for element in row:
            flattened.append(element)
    return flattened

In [5]:
topic = input("Enter your topic:" )

# The search returns one list of URLs; it is wrapped in an outer list and then
# flattened back into a flat list before being handed to the crawler.
url_list = [google_search_actor(topic)]
flat_url_list = flatten_comprehension(url_list)

# Start the crawl and keep the loader for the indexing step.
loader = content_crawler(flat_url_list)

Query: Lebron James has 8 urls
Scraping https://en.wikipedia.org/wiki/LeBron_James
Scraping https://www.espn.com/nba/player/_/id/1966/lebronjames
Scraping https://www.instagram.com/kingjames/?hl=en
Scraping https://twitter.com/KingJames?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor
Scraping https://www.lebronjames.com/
Scraping https://www.nba.com/player/2544/lebron-james/
Scraping https://www.basketball-reference.com/players/j/jamesle01.html
Scraping https://www.facebook.com/LeBron/


In [10]:
import uuid

from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

# Load the crawled pages from the Apify dataset as LangChain documents.
x = loader.load()

# Split pages into overlapping ~1000-character chunks so each retrieved piece
# stays small when it is passed to the chat model.
docs = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
).split_documents(x)

if not docs:
    raise ValueError("The crawl produced no documents to index")

# Index into a fresh, uniquely named collection on every run.
# NOTE(review): with the default collection name, re-running this cell in the
# same kernel appears to add to the existing in-memory collection, so chunks
# from earlier experiments can be retrieved too. That would explain the
# oversized raw-HTML context behind the recorded 429 error - confirm.
db = Chroma.from_documents(
    docs,
    OpenAIEmbeddings(),
    collection_name=f"crawl-{uuid.uuid4().hex}",
)

In [12]:
# Expose the vector store as a retriever and wrap it as a tool the agent can call.
retriever = db.as_retriever()

retriever_tool = create_retriever_tool(
    retriever,
    name="google_search_crawler",
    description="Search for information related to the webscraped query. For more complex questions regarding a query use this tool!",
)

tools = [retriever_tool]

In [13]:
from langchain import hub
from langchain.agents import create_openai_functions_agent
from langchain_openai import ChatOpenAI

# Deterministic chat model used by the agent.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# Get the prompt to use - you can modify this!
prompt = hub.pull("hwchase17/openai-functions-agent")
# Bare expression kept from the original cell; it is not the cell's last
# expression, so the notebook does not display it.
prompt.messages

# Build the function-calling agent from the model, the tools and the prompt.
agent = create_openai_functions_agent(llm, tools, prompt)

In [14]:
from langchain.agents import AgentExecutor

# verbose=True prints the agent's tool calls and intermediate output.
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

user_request = "Hello find me two facts on Lebron James and tell me where you got the information from"

# Left as the final expression so the notebook displays the returned result.
agent_executor.invoke({"input": user_request})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `google_search_crawler` with `{'query': 'Lebron James facts'}`


[0m[36;1m[1;3m</p>
<table class="infobox vcard" style="width:26em"><caption class="infobox-title fn summary">LeBron James</caption><tbody><tr><td colspan="2" class="infobox-image"><span typeof="mw:File"><a href="/wiki/File:LeBron_James_(51959977144)_(cropped2).jpg" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/7/7a/LeBron_James_%2851959977144%29_%28cropped2%29.jpg/250px-LeBron_James_%2851959977144%29_%28cropped2%29.jpg" decoding="async" width="250" height="290" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/7/7a/LeBron_James_%2851959977144%29_%28cropped2%29.jpg/375px-LeBron_James_%2851959977144%29_%28cropped2%29.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/7/7a/LeBron_James_%2851959977144%29_%28cropped2%29.jpg/500px-LeBron_James_%2851959977144%29_%28cropped2%29.jpg 2

RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for gpt-3.5-turbo in organization org-LJxqvqYHWmXevhgqyqRkW8fR on tokens per min (TPM): Limit 60000, Requested 366806. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}