In [48]:
import csv
import os
import re

import requests
from apify_client import ApifyClient
from dotenv import load_dotenv
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import ApifyDatasetLoader
from langchain_community.document_loaders.base import Document
from langchain_community.vectorstores import FAISS
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

In [49]:
# Load variables from the local .env file into the environment; the functions
# in this notebook read APIFY_API_TOKEN from there via os.getenv.
load_dotenv(dotenv_path='.env')

def google_search_actor(question):
    """Run Apify's Google Search actor for a query and collect the result URLs.

    Args:
        question: Search query, passed to the actor as its ``queries`` input.

    Returns:
        list[str]: The ``url`` of every organic result found in the actor's
        default dataset, in dataset order. Empty if the run stored no items.
    """
    # The API token comes from the APIFY_API_TOKEN environment variable.
    apify_token = os.getenv('APIFY_API_TOKEN')

    # Initialize the Apify client
    client = ApifyClient(apify_token)

    # Prepare the Google Search actor input
    run_input = {
        "queries": question,
        "maxPagesPerQuery": 3,
        "resultsPerPage": 5,
        "mobileResults": False,
        "languageCode": "",
        "maxConcurrency": 10,
        "saveHtml": False,
        "saveHtmlToKeyValueStore": False,
        "includeUnfilteredResults": False,
        "customDataFunction": """async ({ input, $, request, response, html }) => {
        return {
        pageTitle: $('title').text(),
        };
    };""",
    }

    # Run the actor and wait for it to finish
    serp = client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)

    # Walk every dataset item rather than only items[0]: an empty dataset no
    # longer raises IndexError, and results beyond the first item are kept.
    # NOTE(review): presumably the actor stores one item per result page
    # (maxPagesPerQuery is 3 above) -- confirm against the actor's output docs.
    pages = client.dataset(serp["defaultDatasetId"]).list_items().items

    url_list = []
    for page in pages:
        for result in page.get('organicResults') or []:
            # Skip empty results and results that carry no URL.
            url = result.get('url') if result else None
            if url:
                url_list.append(url)

    print(len(url_list), "urls")

    return url_list

In [50]:
def content_crawler(urls):
    """Scrape the given URLs with Apify's content crawler and index the text.

    Args:
        urls: Iterable of URL strings used as the crawler's start URLs. A
            single URL string is also accepted and treated as a one-item list.

    Returns:
        The index object produced by ``VectorstoreIndexCreator().from_loaders``
        over the crawler's default dataset.

    Raises:
        ValueError: If ``urls`` is empty, so no actor run is started for
            nothing.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(urls, str):
        urls = [urls]
    urls = list(urls)
    if not urls:
        raise ValueError("content_crawler needs at least one URL to crawl")

    # The API token comes from the APIFY_API_TOKEN environment variable.
    apify_token = os.getenv('APIFY_API_TOKEN')

    # Initialize the Apify client
    client = ApifyClient(apify_token)

    content_crawler_input = {
        "startUrls": [],
        "useSitemaps": False,
        "crawlerType": "playwright:firefox",
        "includeUrlGlobs": [],
        # NOTE(review): Apify glob inputs are usually written as
        # {"glob": "..."}; confirm this {"url": ...} entry is honoured.
        "excludeUrlGlobs": [{ "url": "https://www.britannica.com"}],
        "ignoreCanonicalUrl": False,
        "maxCrawlDepth": 2,
        "maxCrawlPages": 4,
        "initialConcurrency": 0,
        "maxConcurrency": 200,
        "initialCookies": [],
        "proxyConfiguration": { "useApifyProxy": True },
        "maxSessionRotations": 10,
        "maxRequestRetries": 3,
        "requestTimeoutSecs": 60,
        "dynamicContentWaitSecs": 10,
        "maxScrollHeightPixels": 5000,
        "removeElementsCssSelector": """nav, footer, script, style, noscript, svg,
        [role=\"alert\"],
        [role=\"banner\"],
        [role=\"dialog\"],
        [role=\"alertdialog\"],
        [role=\"region\"][aria-label*=\"skip\" i],
        [aria-modal=\"true\"]""",
        "removeCookieWarnings": True,
        "clickElementsCssSelector": "[aria-expanded=\"false\"]",
        "htmlTransformer": "readableText",
        "readableTextCharThreshold": 100,
        "aggressivePrune": False,
        "debugMode": False,
        "debugLog": False,
        "saveHtml": False,
        "saveMarkdown": True,
        "saveFiles": False,
        "saveScreenshots": False,
        "maxResults": 9999999,
        "clientSideMinChangePercentage": 15,
        "renderingTypeDetectionPercentage": 10,
    }

    for url in urls:
        content_crawler_input["startUrls"].append({"url": url})
        print("adding", url, "to startUrls")

    # Run the content crawler actor and wait for it to finish
    website_content_crawler = client.actor("aYG0l9s7dbB7j3gbS").call(run_input=content_crawler_input)

    def _to_document(dataset_item):
        # Fall back to empty text when an item has no "text" value, instead of
        # raising KeyError while the dataset is being loaded.
        return Document(
            page_content=dataset_item.get("text") or "",
            metadata={"source": dataset_item.get("url", "")},
        )

    # Load the dataset into LangChain document format
    loader = ApifyDatasetLoader(
        dataset_id=website_content_crawler["defaultDatasetId"],
        dataset_mapping_function=_to_document,
    )

    index = VectorstoreIndexCreator().from_loaders([loader])

    return index

In [23]:
def get_prompts(query):
    """Ask the chat model for alternative phrasings of ``query``.

    Args:
        query: The user's original question.

    Returns:
        list[str]: One prompt per non-blank line of the model's reply, with
        any leading list marker ("1. ", "2) ", "- ", "* ") removed.
    """
    chat = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)
    message = HumanMessage(
        content=f"""You are an AI language model assistant. Your task is to generate two 
        different versions of the given user question to retrieve relevant documents from a vector 
        database. By generating multiple perspectives on the user question, your goal is to help
        the user overcome some of the limitations of the distance-based similarity search. 
        Provide these alternative Google Search prompts separated by newlines.
        Query: {query}"""
    )
    output = chat.invoke([message])

    prompts = []
    for line in output.content.splitlines():
        # Strip a leading list marker only when one is present. Blindly
        # dropping three characters truncates lines the model did not number.
        cleaned = re.sub(r"^\s*(?:\d+[.):]\s*|[-*\u2022]\s+)", "", line, count=1).strip()
        if cleaned:  # ignore blank separator lines
            prompts.append(cleaned)

    return prompts

In [47]:
# Build an index from the first three Google results for the demo topic.
top_urls = google_search_actor("Steve Jobs")[:3]
index = content_crawler(top_urls)

# Ask the index a question and show the answer followed by its sources.
user_query = input("Enter user query: ")
result = index.query_with_sources(user_query)

print(result['answer'])
print(result['sources'])

 Steve Jobs was an American businessman, inventor, and investor best known for co-founding Apple Inc. He passed away in 2011 and was posthumously awarded the Presidential Medal of Freedom in 2022. 

https://en.wikipedia.org/wiki/Steve_Jobs, https://www.britannica.com/biography/Steve-Jobs, https://www.apple.com/stevejobs/


In [18]:
def run_actors(query):
    """Search Google with rephrasings of ``query`` and index the pages found.

    The alternative prompts come from ``get_prompts``. Each one is searched
    with ``google_search_actor`` and the combined URLs are passed to
    ``content_crawler``.

    Args:
        query: The user's original question.

    Returns:
        Whatever ``content_crawler`` returns for the collected URLs.
    """
    all_url_list = []

    # Reuse get_prompts instead of repeating the prompt-generation code here.
    for question in get_prompts(query):
        if not question.strip():
            continue  # nothing to search for
        print("Adding query:", question)
        # extend, not append: content_crawler expects a flat list of URL
        # strings, not one nested list per query.
        all_url_list.extend(google_search_actor(question))

    # Drop URLs returned by more than one query, keeping first-seen order.
    all_url_list = list(dict.fromkeys(all_url_list))

    print("Urls:", all_url_list)

    docs = content_crawler(all_url_list)

    return docs